Commit f1e3135b authored by qianyj

update TF code

parent f0d87682
Tue Jan 9 09:34:25 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-SXM2... On | 00000000:06:00.0 Off | 0 |
| N/A 50C P0 196W / 300W | 15643MiB / 16276MiB | 97% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P100-SXM2... On | 00000000:07:00.0 Off | 0 |
| N/A 41C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P100-SXM2... On | 00000000:0A:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P100-SXM2... On | 00000000:0B:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 4 Tesla P100-SXM2... On | 00000000:85:00.0 Off | 0 |
| N/A 36C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 5 Tesla P100-SXM2... On | 00000000:86:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 6 Tesla P100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 38C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 7 Tesla P100-SXM2... On | 00000000:8A:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Tue Jan 9 09:34:25 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-SXM2... On | 00000000:06:00.0 Off | 0 |
| N/A 50C P0 196W / 300W | 15643MiB / 16276MiB | 97% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P100-SXM2... On | 00000000:07:00.0 Off | 0 |
| N/A 41C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P100-SXM2... On | 00000000:0A:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P100-SXM2... On | 00000000:0B:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 4 Tesla P100-SXM2... On | 00000000:85:00.0 Off | 0 |
| N/A 36C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 5 Tesla P100-SXM2... On | 00000000:86:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 6 Tesla P100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 38C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 7 Tesla P100-SXM2... On | 00000000:8A:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 44454 C /usr/bin/python 15631MiB |
| 1 44454 C /usr/bin/python 15471MiB |
| 2 44454 C /usr/bin/python 15471MiB |
| 3 44454 C /usr/bin/python 15471MiB |
+-----------------------------------------------------------------------------+
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
nvme0n8 259:7 0 375G 0 disk
nvme0n6 259:5 0 375G 0 disk
sdb 8:16 0 50G 0 disk
└─sdb1 8:17 0 50G 0 part /tmpfs
nvme0n4 259:3 0 375G 0 disk
nvme0n2 259:1 0 375G 0 disk
nvme0n7 259:6 0 375G 0 disk
nvme0n5 259:4 0 375G 0 disk
sda 8:0 0 100G 0 disk
└─sda1 8:1 0 100G 0 part /
nvme0n3 259:2 0 375G 0 disk
nvme0n1 259:0 0 375G 0 disk
"""Utility to manage the tpu version before starting the benchmark."""
import json
from absl import logging
from six.moves.urllib import request
try:
from cloud_tpu_client import client # pylint: disable=g-import-not-at-top
except ImportError:
print(
'Falling back to TensorFlow client; we recommend you install the Cloud '
'TPU client directly with pip install cloud-tpu-client.')
from tensorflow.python.tpu.client import client # pylint: disable=g-import-not-at-top
def _as_text(s):
"""Converts a byte/string into string."""
if isinstance(s, bytes):
return s.decode('utf-8')
return s
def _get_content(url):
"""Opens the url and loads the response into json."""
logging.info('opening url %s', url)
req = request.Request(url)
resp = request.urlopen(req)
resp_text = _as_text(resp.read())
logging.info('response text = %s', resp_text)
return json.loads(resp_text)
def _get_version_info(url, version_label):
"""Constructs a version info from the response."""
json_data = _get_content(url)
logging.info('json_data = %s', json_data)
if 'currentVersion' in json_data:
commit_id = json_data['currentVersion']
elif 'buildLabel' in json_data:
commit_id = json_data['buildLabel']
else:
commit_id = ''
info = {
'url': '',
'hash': commit_id,
'branch': version_label,
'piper_id': json_data.get('piperOriginRevId', '')
}
return info
def _configure_tpu_version(tpu_name, version_label, new_version_id):
"""Returns the current tpu version after resetting to an optional version."""
# The tpu_name is an arbitrary, user-chosen unique string for this tpu.
logging.info('Trying to connect to tpu %s', tpu_name)
tpu_client = client.Client(tpu=tpu_name)
tpu_client.wait_for_healthy()
if new_version_id:
logging.info('Trying to reset tpu version to %s', new_version_id)
tpu_client.configure_tpu_version(version=new_version_id)
tpu_client.wait_for_healthy()
logging.info('TPU healthy after version reset.')
else:
logging.info('Using the default tpu version id.')
workers = tpu_client.network_endpoints()
if workers:
ip_addr = workers[0]['ipAddress']
url = 'http://{}:8475/requestversion'.format(ip_addr)
return _get_version_info(url, version_label)
else:
logging.error('No tpu endpoint info')
return {
'url': '',
'hash': '',
'branch': version_label,
'piper_id': '',
}
def configure_tpu(tpu_params):
return _configure_tpu_version(
tpu_params.get('name'),
version_label=tpu_params.get('version'),
new_version_id=tpu_params.get('version_id'))
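# Illustrative usage (hypothetical parameter values): the returned dict always
# has the keys 'url', 'hash', 'branch' and 'piper_id'.
#   info = configure_tpu({'name': 'my-tpu', 'version': 'nightly', 'version_id': ''})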
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero utility methods."""
from __future__ import print_function
import importlib
import logging
import os
import shutil
import subprocess
import sys
import threading
import traceback
import requests
import json
import re
def create_empty_file(parent_directory, file_basename):
"""Creates an empty file with a given basename in a parent directory.
Creates parent_directory and intermediate directories if they don't exist.
This is mostly used for creating no-op actions in the Dockerfile.
Args:
parent_directory: The path to the parent directory.
file_basename: The basename for the empty file.
"""
if not os.path.isdir(parent_directory):
os.makedirs(parent_directory)
full_file_name = os.path.join(parent_directory, file_basename)
with open(full_file_name, 'w'):
print('Creating empty file: {}'.format(full_file_name))
def checkout_git_repos(git_repos, use_cached_site_packages):
"""Clone, update, or sync a repo.
Args:
git_repos: array of dict containing attributes of the git repo to checkout.
use_cached_site_packages: If true, skip git pull if git_repo already exists.
Returns:
A dict containing attributes of the git repositories
"""
site_package_info = {}
for repo in git_repos:
logging.info('Checking out repository from %s to %s',
repo['url'], repo['local_path'])
if not os.path.isdir(repo['local_path']):
run_commands(['git clone {} {}'.format(repo['url'], repo['local_path'])])
if 'branch' in repo:
run_commands(['git -C {} checkout {}'.format(
repo['local_path'], repo['branch'])])
if not use_cached_site_packages or 'git_hash' in repo:
run_commands(['git -C {} pull --rebase'.format(repo['local_path'])])
if 'git_hash' in repo:
run_commands(['git -C {} reset --hard {}'.format(
repo['local_path'], repo['git_hash'])])
logging.info('Checked-out repository from %s to %s',
repo['url'], repo['local_path'])
site_package_info[repo['dir_name']] = get_git_repo_info(repo['local_path'])
return site_package_info
def get_git_repo_info(local_path):
"""Get information of the git repository specified by the local_path."""
git_repo_info = {}
# Get git url
cmd = 'git -C {} config --get remote.origin.url'.format(local_path)
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
git_repo_info['url'] = lines[0]
else:
logging.error('Error getting git url for repository %s due to %s',
local_path, result)
return {}
# Get git branch
cmd = 'git -C {} rev-parse --abbrev-ref HEAD'.format(local_path)
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
git_repo_info['branch'] = lines[0]
else:
logging.error('Error getting git branch for repository %s due to %s',
local_path, result)
return {}
# Get git hash
cmd = 'git -C {} rev-parse HEAD'.format(local_path)
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
git_repo_info['hash'] = lines[0]
else:
logging.error('Error getting git hash for repository %s due to %s',
local_path, result)
return {}
return git_repo_info
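# Shape of the returned dict on success (values illustrative):
#   {'url': 'https://github.com/tensorflow/benchmarks.git',
#    'branch': 'master',
#    'hash': '<commit-sha>'}
# An empty dict is returned if any of the three git queries fails.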
def setup_python_path(site_packages_dir, python_path_str):
if python_path_str:
python_paths = python_path_str.split(',')
for python_path in python_paths:
logging.info('Adding path %s to sys.path', python_path)
sys.path.append(os.path.join(site_packages_dir, python_path))
logging.debug('PYTHONPATH: %s', sys.path)
def active_gcloud_service(gcloud_key_file_url, workspace_dir,
download_only=False):
"""Download key file and setup gcloud service credential using the key file.
Args:
gcloud_key_file_url: gcloud key file url
workspace_dir: directory that the key file is downloaded to
download_only: skip setting up the gcloud service credential if this is true
"""
if not gcloud_key_file_url:
return
local_path = os.path.join(workspace_dir,
os.path.basename(gcloud_key_file_url))
if not os.path.exists(local_path):
download_data([{'url': gcloud_key_file_url, 'local_path': local_path}])
if not download_only:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = local_path
run_commands(['gcloud auth activate-service-account --key-file {}'.format(
local_path)])
logging.info('Activated gcloud service account credential')
def setup_gsutil_credential():
run_commands(['gcloud config set pass_credentials_to_gsutil true'])
def download_data(download_infos):
"""Download data from url to local_path for each (url, local_path) pair in the download_infos.
Each url should start with gs://, http://, https:// or file://.
Downloaded files whose names end with .gz will be decompressed in the
directory that contains them.
Args:
download_infos: array of dict which specifies the url and local_path for
data download
"""
for info in download_infos:
if os.path.exists(info['local_path']):
continue
original_base_name = os.path.basename(info['url'])
expected_base_name = os.path.basename(info['local_path'])
local_path_parent = os.path.dirname(info['local_path'])
logging.info('Downloading data from %s to %s',
info['url'], info['local_path'])
make_dir_if_not_exist(local_path_parent)
# Download data to the local path
if info['url'].startswith('http://') or info['url'].startswith('https://'):
request = requests.get(info['url'], allow_redirects=True)
f = open(info['local_path'], 'wb')
f.write(request.content)
f.close()
elif info['url'].startswith('gs://'):
cmd = ['gsutil', '-m', 'cp', '-r', '-n', info['url'], local_path_parent]
run_commands([cmd], shell=False)
elif info['url'].startswith('file://'):
cmd = ['cp', info['url'][7:], local_path_parent]
run_commands([cmd], shell=False)
else:
raise ValueError('Url {} with prefix {} is not supported.'.format(
info['url'], info['url'].split(':')[0]))
# Move data to the expected local path
if original_base_name != expected_base_name:
run_commands(['mv {} {}'.format(
os.path.join(local_path_parent, original_base_name),
os.path.join(local_path_parent, expected_base_name))])
logging.info('Downloaded data from %s to %s',
info['url'], info['local_path'])
# Decompress file if file name ends with .gz unless caller sets 'decompress'
# to False in info.
if info['url'].endswith('.gz') and info.get('decompress', True):
run_commands(['tar xvf {} -C {}'.format(
info['local_path'], local_path_parent)])
logging.info('Decompressed file %s', info['local_path'])
def parse_data_downloads_str(root_data_dir, data_downloads_str):
"""Parse a comma separated string into array of dicts.
Each dict specifies the url and local_path for a download.
Args:
root_data_dir: the directory which should contain all the dataset files
data_downloads_str: a comma separated string specified by the
flag --data_downloads
Returns:
An array of dict which specifies the url and local_path for data download
"""
download_infos = []
if not data_downloads_str:
return download_infos
for entry in data_downloads_str.split(','):
info = {}
if ';' in entry:
info['url'] = entry.split(';')[0]
info['local_path'] = os.path.join(root_data_dir, entry.split(';')[1])
else:
info['url'] = entry
info['local_path'] = os.path.join(root_data_dir, os.path.basename(entry))
# Canonicalize url to remove trailing '/' and '*'
if info['url'].endswith('*'):
info['url'] = info['url'][:-1]
if info['url'].endswith('/'):
info['url'] = info['url'][:-1]
download_infos.append(info)
return download_infos
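# Illustrative example (hypothetical urls):
#   parse_data_downloads_str('/data', 'gs://bucket/a;sub/a,gs://bucket/b')
#   returns [{'url': 'gs://bucket/a', 'local_path': '/data/sub/a'},
#            {'url': 'gs://bucket/b', 'local_path': '/data/b'}]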
def maybe_upload_to_gcs(local_dir, output_gcs_url):
if not output_gcs_url:
return
run_commands(['gsutil -m cp -r {} {}'.format(local_dir, output_gcs_url)])
logging.info('Uploaded data from local directory %s to gcs %s',
local_dir, output_gcs_url)
def make_dir_if_not_exist(local_path):
if not os.path.exists(local_path):
os.makedirs(local_path)
logging.info('Created directory %s', local_path)
def run_command(cmd, shell=True):
"""Structures for a variety of different test results.
Args:
cmd: Command to execute
shell: True to use shell, false otherwise.
Returns:
Tuple of the command exit code and its combined stdout/stderr as a string.
"""
logging.debug('Executing command: %s', cmd)
p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, shell=shell)
exit_code = None
line = ''
stdout = ''
while exit_code is None or line:
exit_code = p.poll()
line = p.stdout.readline().decode('utf-8')
stdout += line
logging.debug(line)
return exit_code, stdout
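# Minimal usage sketch:
#   exit_code, stdout = run_command('echo hello')
#   # exit_code is 0 and stdout contains 'hello' on success.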
def run_commands(cmds, shell=True):
"""Runs list of command and throw error if any fail."""
for cmd in cmds:
exit_code, stdout = run_command(cmd, shell=shell)
if exit_code:
raise Exception('"{}" failed with code:{} and stdout:\n{}'.format(
cmd, exit_code, stdout))
def get_cpu_name():
cmd = "cat /proc/cpuinfo | grep 'model name' | sort --unique"
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
model_name_parts = lines[0].split(':')
return model_name_parts[1].strip()
else:
logging.error('Error getting cpuinfo model name: %s', result)
return ''
def get_cpu_socket_count():
cmd = 'grep -i "physical id" /proc/cpuinfo | sort -u | wc -l'
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
return int(lines[0])
else:
logging.error('Error getting cpuinfo socket count: %s', result)
return -1
def _get_amd_gpu_info():
"""Returns gpu information using rocm-smi.
Note: Assumes if the system has multiple GPUs, that they are all the same
Returns:
A dict containing gpu_driver_version, gpu_model and gpu_count or None if
`rocm-smi` is not found or fails.
"""
cmd = 'rocm-smi --json --showproductname --showdriverversion'
exit_code, result = run_command(cmd)
if exit_code != 0:
logging.error('rocm-smi did not return as expected: %s', result)
return None
def get_gpu_driver_version(rocm_smi_output):
return rocm_smi_output['system']['Driver version']
def get_gpu_model(rocm_smi_output):
gpu_model = ""
for key, value in rocm_smi_output.items():
if re.match("card[0-9]+", key):
gpu_model = value['Card SKU']
break
return gpu_model
def get_gpu_count(rocm_smi_output):
gpu_count = 0
for key, value in rocm_smi_output.items():
if re.match("card[0-9]+", key):
gpu_count += 1
return gpu_count
rocm_smi_output = json.loads(result)
gpu_info = {}
gpu_info['gpu_driver_version'] = get_gpu_driver_version(rocm_smi_output)
gpu_info['gpu_model'] = get_gpu_model(rocm_smi_output)
gpu_info['gpu_count'] = get_gpu_count(rocm_smi_output)
return gpu_info
def _get_nvidia_gpu_info():
"""Returns gpu information using nvidia-smi.
Note: Assumes if the system has multiple GPUs that they are all the same with
one exception. If the first result is a Quadro, the heuristic assumes
this may be a workstation and takes the second entry.
Returns:
A dict containing gpu_driver_version, gpu_model and gpu_count or None if
`nvidia-smi` is not found or fails.
"""
cmd = 'nvidia-smi --query-gpu=driver_version,gpu_name --format=csv'
exit_code, result = run_command(cmd)
if exit_code != 0:
logging.error('nvidia-smi did not return as expected: %s', result)
return None
lines = result.splitlines()
gpu_info_line = lines[1]
if 'Quadro' in gpu_info_line and len(lines) >= 3:
gpu_info_line = lines[2]
gpu_info = {}
gpu_info['gpu_driver_version'] = gpu_info_line.split(',')[0].strip()
gpu_info['gpu_model'] = gpu_info_line.split(',')[1].strip()
gpu_info['gpu_count'] = len(lines) - 1
return gpu_info
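# Example of the parsed result for an nvidia-smi csv response such as
# 'driver_version, name\n381.99, GTX 1080\n' (values illustrative):
#   {'gpu_driver_version': '381.99', 'gpu_model': 'GTX 1080', 'gpu_count': 1}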
def get_gpu_info():
"""Returns gpu information using either nvidia-smi or rocm-smi.
Returns:
A dict containing gpu_driver_version, gpu_model and gpu_count, or None if
the underlying rocm-smi/nvidia-smi query is not found or fails.
"""
return _get_amd_gpu_info() if shutil.which("rocm-smi") \
else _get_nvidia_gpu_info()
def _install_tpu_tool():
"""Installs the ctpu tool to managing cloud TPUs.
Follows the instructions here:
https://github.com/tensorflow/tpu/tree/master/tools/ctpu
"""
if not os.path.exists('ctpu'):
logging.info('Installing TPU tool')
commands = [
'wget https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu',
'chmod a+x ctpu',
]
run_commands(commands)
def setup_tpu(parameters):
"""Sets up a TPU with a given set of parameters.
Args:
parameters: dictionary of TPU parameters.
Returns:
True if an error occurs during setup.
"""
try:
_install_tpu_tool()
args = [
'--name={}'.format(parameters.get('name')),
'--project={}'.format(parameters.get('project')),
'--zone={}'.format(parameters.get('zone')),
'--tpu-size={}'.format(parameters.get('size')),
'--tf-version={}'.format(parameters.get('version')),
'--tpu-only',
'-noconf',
]
command = './ctpu up {}'.format(' '.join(args))
logging.info('Setting up TPU: %s', command)
exit_code, output = run_command(command)
if exit_code != 0:
logging.error('Error in setup with output: %s', output)
return exit_code != 0
except Exception:
logging.error('Unable to setup TPU')
run_command('rm -f ctpu')
sys.exit(1)
def cleanup_tpu(parameters):
"""Cleans up an existing TPU.
Args:
parameters: dictionary of TPU parameters.
Returns:
True if an error occurs during cleanup.
"""
_install_tpu_tool()
args = [
'--name={}'.format(parameters.get('name')),
'--project={}'.format(parameters.get('project')),
'--zone={}'.format(parameters.get('zone')),
'--tpu-only',
'-noconf',
]
command = './ctpu delete {}'.format(' '.join(args))
logging.info('Cleaning up TPU: %s', command)
exit_code, output = run_command(command)
if exit_code != 0:
logging.error('Error in cleanup with output: %s', output)
return exit_code != 0
def read_benchmark_result(benchmark_result_file_path):
"""Read benchmark result from the protobuf file."""
from google.protobuf import json_format # pylint: disable=g-import-not-at-top
from tensorflow.core.util import test_log_pb2 # pylint: disable=g-import-not-at-top
if not os.path.isfile(benchmark_result_file_path):
logging.error('Failed to read benchmark result because '
'file %s does not exist', benchmark_result_file_path)
return {}
with open(benchmark_result_file_path, 'rb') as f:
benchmark_entries = test_log_pb2.BenchmarkEntries()
benchmark_entries.ParseFromString(f.read())
return json_format.MessageToDict(
benchmark_entries,
preserving_proto_field_name=True,
including_default_value_fields=True)['entry'][0]
def print_thread_stacktrace():
print('Here is the stacktrace for all threads:')
thread_names = {t.ident: t.name for t in threading.enumerate()}
for thread_id, frame in sys._current_frames().items(): # pylint: disable=protected-access
print('Thread {}'.format(thread_names.get(thread_id, thread_id)))
traceback.print_stack(frame)
def instantiate_benchmark_class(
benchmark_class, output_dir, root_data_dir, tpu, constructor_args,
benchmark_class_type=None):
"""Return initialized benchmark class."""
module_import_path, class_name = benchmark_class.rsplit('.', 1)
module = importlib.import_module(module_import_path)
class_ = getattr(module, class_name)
if benchmark_class_type == 'tf_benchmark':
# for benchmarks inheriting from tf.test.Benchmark, instantiate them directly.
instance = class_(**constructor_args)
else:
# Default instantiation for perfzero_benchmark classes.
instance = class_(
output_dir=output_dir,
root_data_dir=root_data_dir,
tpu=tpu,
**constructor_args)
return instance
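# benchmark_class is a fully qualified '<module.path>.<ClassName>' string.
# Hypothetical example:
#   instantiate_benchmark_class('some_package.foo_benchmark.FooBenchmark',
#                               output_dir='/tmp/out', root_data_dir='/data',
#                               tpu=None, constructor_args={})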
def copy_and_rename_dirs(dir_spec_string, dst_base_dir):
"""Copies list of <dir-path>:new_name specs into a new dest dir.
If a path /path1/path2/dir:new_dir is given, it copies /path1/path2/dir to
dst_base_dir/new_dir.
Args:
dir_spec_string: Comma separated list of /path1/path2:new_name specs.
dst_base_dir: The base dir to contain the copies.
"""
if not dir_spec_string:
return
dir_specs = dir_spec_string.split(',')
for src_dir_with_name in dir_specs:
src_dir, final_basename = src_dir_with_name.split(':')
dst_dir = os.path.join(dst_base_dir, final_basename)
if os.path.isdir(dst_dir):
logging.info('[DELETE] pre-existing %s', dst_dir)
shutil.rmtree(dst_dir)
logging.info('[COPY] %s -> %s', src_dir, dst_dir)
shutil.copytree(src_dir, dst_dir)
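# Illustrative example (hypothetical paths): with
# dir_spec_string='/src/models:models_copy' and dst_base_dir='/workspace',
# /src/models is copied to /workspace/models_copy.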
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests utils.py."""
import os
import unittest
from mock import call
from mock import MagicMock
from mock import patch
import perfzero.utils as utils
import tensorflow as tf # pylint: disable=g-bad-import-order
class TestUtils(unittest.TestCase, tf.test.Benchmark):
def test_protobuf_read(self):
output_dir = '/tmp/'
os.environ['TEST_REPORT_FILE_PREFIX'] = output_dir
benchmark_result_file_path = os.path.join(output_dir,
'TestUtils.testReportBenchmark')
if os.path.exists(benchmark_result_file_path):
os.remove(benchmark_result_file_path)
self.report_benchmark(
iters=2000,
wall_time=1000,
name='testReportBenchmark',
metrics=[{'name': 'metric_name_1', 'value': 0, 'min_value': 1},
{'name': 'metric_name_2', 'value': 90, 'min_value': 0,
'max_value': 95}])
actual_result = utils.read_benchmark_result(
benchmark_result_file_path)
os.remove(benchmark_result_file_path)
expected_result = {
'name': 'TestUtils.testReportBenchmark',
# google.protobuf.json_format.MessageToDict() will convert
# int64 field to string.
'iters': '2000',
'wall_time': 1000,
'cpu_time': 0,
'throughput': 0,
'extras': {},
'metrics': [
{
'name': 'metric_name_1',
'value': 0,
'min_value': 1
},
{
'name': 'metric_name_2',
'value': 90,
'min_value': 0,
'max_value': 95
}
]
}
self.assertDictEqual(expected_result, actual_result)
@patch('perfzero.utils.get_git_repo_info')
@patch('perfzero.utils.run_commands')
def test_checkout_git_repos(self, run_commands_mock, get_git_repo_info_mock):
git_repo_1 = {}
git_repo_1['url'] = 'url_1'
git_repo_1['local_path'] = 'local_path_1'
git_repo_1['dir_name'] = 'dir_name_1'
git_repo_1['branch'] = 'branch_1'
git_repo_1['git_hash'] = 'git_hash_1'
git_repo_2 = {}
git_repo_2['url'] = 'url_2'
git_repo_2['local_path'] = 'local_path_2'
git_repo_2['dir_name'] = 'dir_name_2'
git_repo_2['branch'] = 'branch_2'
git_repo_info_1 = {'url': 'url_1'}
git_repo_info_2 = {'url': 'url_2'}
get_git_repo_info_mock.side_effect = \
lambda local_path: git_repo_info_1 if local_path == 'local_path_1' else git_repo_info_2 # pylint: disable=line-too-long
site_package_info = utils.checkout_git_repos([git_repo_1, git_repo_2],
False)
self.assertEqual(2, len(site_package_info))
self.assertEqual(git_repo_info_1, site_package_info['dir_name_1'])
self.assertEqual(git_repo_info_2, site_package_info['dir_name_2'])
run_commands_mock.assert_has_calls(any_order=False, calls=[
call(['git clone url_1 local_path_1']),
call(['git -C local_path_1 checkout branch_1']),
call(['git -C local_path_1 pull --rebase']),
call(['git -C local_path_1 reset --hard git_hash_1']),
call(['git clone url_2 local_path_2']),
call(['git -C local_path_2 checkout branch_2'])
])
@patch('perfzero.utils.run_command')
def test_get_git_repo_info(self, run_command_mock):
run_command_mock.side_effect = [
[0, 'git_url'],
[0, 'branch_name'],
[0, 'git_hash']
]
git_repo_info = utils.get_git_repo_info('local_path_1')
self.assertEqual(
{'url': 'git_url', 'branch': 'branch_name', 'hash': 'git_hash'},
git_repo_info)
run_command_mock.assert_has_calls(any_order=False, calls=[
call('git -C local_path_1 config --get remote.origin.url'),
call('git -C local_path_1 rev-parse --abbrev-ref HEAD'),
call('git -C local_path_1 rev-parse HEAD')
])
@patch('builtins.open')
@patch('perfzero.utils.make_dir_if_not_exist')
@patch('requests.get')
@patch('perfzero.utils.run_commands')
def test_download_data(self, run_commands_mock, requests_get_mock,
make_dir_mock, open_mock): # pylint: disable=unused-argument
get_mock = MagicMock()
get_mock.content = 'content'
requests_get_mock.return_value = get_mock
download_info_1 = {'url': 'gs://remote_path_1/name_1',
'local_path': 'local_path_1/modified_name_1'}
download_info_2 = {'url': 'http://remote_path_2/name_2',
'local_path': 'local_path_2/modified_name_2'}
utils.download_data([download_info_1, download_info_2])
make_dir_mock.assert_has_calls(any_order=False, calls=[
call('local_path_1'),
call('local_path_2')
])
requests_get_mock.assert_called_once_with('http://remote_path_2/name_2',
allow_redirects=True)
run_commands_mock.assert_has_calls(any_order=False, calls=[
call([['gsutil', '-m', 'cp', '-r', '-n',
'gs://remote_path_1/name_1', 'local_path_1']],
shell=False),
call(['mv local_path_1/name_1 local_path_1/modified_name_1']),
call(['mv local_path_2/name_2 local_path_2/modified_name_2'])
])
def test_parse_data_downloads_str(self):
data_downloads_str = 'url_1;relative_path_1,url_2;relative_path_2'
download_infos = utils.parse_data_downloads_str('/root_data_dir',
data_downloads_str)
self.assertEqual(2, len(download_infos))
self.assertEqual(download_infos[0],
{'url': 'url_1',
'local_path': '/root_data_dir/relative_path_1'})
self.assertEqual(download_infos[1],
{'url': 'url_2',
'local_path': '/root_data_dir/relative_path_2'})
@patch('perfzero.utils.run_command')
def test_get_cpu_name(self, run_command_mock):
"""Tests extract the cpu model name."""
run_command_mock.return_value = [
0, 'model name : Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz\n'
]
cpu_name = utils.get_cpu_name()
self.assertEqual('Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz', cpu_name)
@patch('perfzero.utils.run_command')
def test_get_cpu_socket_count(self, run_command_mock):
"""Tests get socket count."""
run_command_mock.return_value = [0, '2\n']
cpu_socket_count = utils.get_cpu_socket_count()
self.assertEqual(2, cpu_socket_count)
@patch('perfzero.utils.run_command')
def test_get_gpu_model(self, run_command_mock):
# Tests get gpu info parses expected value into expected components.
run_command_mock.return_value = [
0, 'driver_version, name\n381.99, GTX 1080 \n'
]
gpu_model = utils.get_gpu_info()['gpu_model']
self.assertEqual('GTX 1080', gpu_model)
# Tests gpu info returns second entry if first entry is a Quadro.
run_command_mock.return_value = [
0, 'blah\n200.99, Quadro K900 \n381.99, GTX 1080\n'
]
gpu_model = utils.get_gpu_info()['gpu_model']
self.assertEqual('GTX 1080', gpu_model)
@patch('perfzero.utils.run_command')
def test_get_gpu_count(self, run_command_mock):
"""Tests gpu info returns second entry if first entry is a Quadro."""
run_command_mock.return_value = [
0, 'blah\n200.99, Quadro K900 \n381.99, GTX 1080\n'
]
gpu_count = utils.get_gpu_info()['gpu_count']
self.assertEqual(2, gpu_count)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Checkout repository, download data and build docker image."""
from __future__ import print_function
import argparse
import json
import logging
import os
import shutil
import sys
import tempfile
import time
import perfzero.device_utils as device_utils
import perfzero.perfzero_config as perfzero_config
import perfzero.utils as utils
def _temporary_file_name(parent_dir, base_name):
"""Returns a temp name of the form <parent-dir>/<random>/<base-name>."""
if not os.path.isdir(parent_dir):
os.makedirs(parent_dir)
temp_dir = tempfile.mkdtemp(dir=parent_dir)
return os.path.join(temp_dir, base_name)
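# Example (illustrative): _temporary_file_name('/workspace', 'local_docker')
# returns something like '/workspace/<random-tmp-dir>/local_docker'.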
def _load_docker_image(FLAGS, workspace_dir, setup_execution_time):
"""Runs docker load --input_image <FLAGS.dockerfile_path>.
Fetches FLAGS.dockerfile_path to workspace_dir/<temp-dir>/local_docker first.
Runs docker load --input <path-to-local-docker>.
Deletes workspace_dir/<temp-dir> after the docker image is loaded.
Args:
FLAGS: parser.parse_known_args object.
workspace_dir: String - The path to use for intermediate artifacts.
setup_execution_time: Map from string->double containing wall times for
different operations. This will have insertions describing the docker
setup time.
"""
load_docker_start_time = time.time()
local_docker_image_path = _temporary_file_name(workspace_dir, 'local_docker')
utils.download_data([{'url': FLAGS.dockerfile_path,
'local_path': local_docker_image_path,
'decompress': False}])
setup_execution_time['fetch_docker'] = time.time() - load_docker_start_time
docker_load_cmd = 'docker load --input {}'.format(local_docker_image_path)
try:
utils.run_commands(
[docker_load_cmd,
'docker images' # Print loaded image list.
])
setup_execution_time['load_docker'] = time.time() - load_docker_start_time
finally:
logging.info('removing parent dir of local docker image copy %s',
local_docker_image_path)
shutil.rmtree(os.path.dirname(local_docker_image_path))
def _create_docker_image(FLAGS, project_dir, workspace_dir,
setup_execution_time):
"""Creates a docker image.
Args:
FLAGS: parser.parse_known_args object.
project_dir: String - The current project path.
workspace_dir: String - The path to use for intermediate artifacts.
setup_execution_time: Map from string->double containing wall times for
different operations. This will have insertions describing the docker
setup time.
"""
# Create docker image
docker_start_time = time.time()
docker_context = os.path.join(workspace_dir, 'resources')
# Necessary in case we don't have a local .whl file.
utils.create_empty_file(docker_context, 'EMPTY')
# Download the TensorFlow pip package from Google Cloud Storage or a local
# file:// url and modify the package path accordingly, if applicable
local_tensorflow_pip_spec = None
if (FLAGS.tensorflow_pip_spec and
(FLAGS.tensorflow_pip_spec.startswith('gs://') or
FLAGS.tensorflow_pip_spec.startswith('file://'))):
local_pip_filename = os.path.basename(FLAGS.tensorflow_pip_spec)
local_pip_path = os.path.join(docker_context, local_pip_filename)
utils.download_data([{'url': FLAGS.tensorflow_pip_spec,
'local_path': local_pip_path}])
# Update path to pip wheel file for the Dockerfile. Note that this path has
# to be relative to the docker context (absolute path will not work).
FLAGS.tensorflow_pip_spec = local_pip_filename
local_tensorflow_pip_spec = local_pip_filename
else:
local_tensorflow_pip_spec = 'EMPTY'
dockerfile_path = FLAGS.dockerfile_path
if not os.path.exists(dockerfile_path):
# Fall back to the deprecated approach if the user-specified
# dockerfile_path does not exist
dockerfile_path = os.path.join(project_dir, FLAGS.dockerfile_path)
extra_pip_specs = (FLAGS.extra_pip_specs or '').replace(';', '')
docker_base_cmd = 'docker build --no-cache --pull'
# FLAGS.extra_docker_build_args will be a list of strings (e.g. ['a', 'b=c']).
# We treat the strings directly as build-args: --build-arg a --build-arg b=c
# Empty strings are ignored.
extra_docker_build_args = ' '.join([
'--build-arg %s' % arg for arg in FLAGS.extra_docker_build_args if arg])
cmd = '{docker_base_cmd} -t {docker_tag}{tf_pip}{local_tf_pip}{extra_pip}{extra_docker_build_args} {suffix}'.format(
docker_base_cmd=docker_base_cmd,
docker_tag=FLAGS.docker_tag,
tf_pip=(
' --build-arg tensorflow_pip_spec={}'.format(
FLAGS.tensorflow_pip_spec) if FLAGS.tensorflow_pip_spec else ''),
# local_tensorflow_pip_spec is either string 'EMPTY' or basename of
# local .whl file.
local_tf_pip=' --build-arg local_tensorflow_pip_spec={}'.format(
local_tensorflow_pip_spec),
extra_pip=' --build-arg extra_pip_specs=\'{}\''.format(extra_pip_specs),
extra_docker_build_args=' ' + extra_docker_build_args,
suffix=(
'-f {} {}'.format(dockerfile_path, docker_context)
if docker_context else '- < {}'.format(dockerfile_path))
)
utils.run_commands([cmd])
logging.info('Built docker image with tag %s', FLAGS.docker_tag)
setup_execution_time['build_docker'] = time.time() - docker_start_time
if __name__ == '__main__':
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
perfzero_config.add_setup_parser_arguments(parser)
FLAGS, unparsed = parser.parse_known_args()
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
level=logging.DEBUG)
if unparsed:
logging.error('Arguments %s are not recognized', unparsed)
sys.exit(1)
setup_execution_time = {}
project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
workspace_dir = os.path.join(project_dir, FLAGS.workspace)
site_package_dir = os.path.join(workspace_dir, 'site-packages')
utils.copy_and_rename_dirs(FLAGS.site_package_downloads,
site_package_dir)
activate_gcloud = False
if FLAGS.dockerfile_path and FLAGS.dockerfile_path.startswith('gs://'):
# We might end up doing gsutil fetch later, so need to call
# active_gcloud_service().
activate_gcloud = True
if FLAGS.tensorflow_pip_spec and FLAGS.tensorflow_pip_spec.startswith('gs://'):
activate_gcloud = True
# Download gcloud auth token. Remove this operation in the future when
# docker in Kokoro can access the GCP metadata server
start_time = time.time()
utils.active_gcloud_service(FLAGS.gcloud_key_file_url,
workspace_dir, download_only=not activate_gcloud)
setup_execution_time['download_token'] = time.time() - start_time
# Set up the raid array.
start_time = time.time()
device_utils.create_drive_from_devices(FLAGS.root_data_dir,
FLAGS.gce_nvme_raid)
setup_execution_time['create_drive'] = time.time() - start_time
if FLAGS.dockerfile_path:
if FLAGS.dockerfile_path.endswith('.tar.gz'):
logging.info('Assuming given file %s is a docker image to load',
FLAGS.dockerfile_path)
_load_docker_image(FLAGS, workspace_dir,
setup_execution_time)
else:
_create_docker_image(FLAGS, project_dir, workspace_dir,
setup_execution_time)
logging.info('Setup time in seconds by operation:\n %s',
json.dumps(setup_execution_time, indent=2))
[
{
"name": "execution_timestamp",
"type": "TIMESTAMP",
"mode": "REQUIRED"
},
{
"name": "execution_id",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "ml_framework_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "benchmark_result",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "benchmark_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "setup_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "system_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "process_info",
"type": "STRING",
"mode": "NULLABLE"
}
]
#!/usr/bin/env bash
#
# Steps:
#
# 1. Download corresponding html file for some README.md:
# curl -s $1
#
# 2. Discard rows that do not contain the substring 'user-content-' (github's markup):
# awk '/user-content-/ { ...
#
# 3.1 Get last number in each row like ' ... </span></a>sitemap.js</h1'.
# It's a level of the current header:
# substr($0, length($0), 1)
#
# 3.2 Get level from 3.1 and insert corresponding number of spaces before '*':
# sprintf("%*s", substr($0, length($0), 1)*3, " ")
#
# 4. Find head's text and insert it inside "* [ ... ]":
# substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
#
# 5. Find anchor and insert it inside "(...)":
# substr($0, match($0, "href=\"[^\"]+?\" ")+6, RLENGTH-8)
#
gh_toc_version="0.6.0"
gh_user_agent="gh-md-toc v$gh_toc_version"
#
# Download the README.md rendered into html, given its url.
#
#
gh_toc_load() {
local gh_url=$1
if type curl &>/dev/null; then
curl --user-agent "$gh_user_agent" -s "$gh_url"
elif type wget &>/dev/null; then
wget --user-agent="$gh_user_agent" -qO- "$gh_url"
else
echo "Please, install 'curl' or 'wget' and try again."
exit 1
fi
}
#
# Converts a local md file into html via the GitHub markdown API
#
# ➥ curl -X POST --data '{"text": "Hello world github/linguist#1 **cool**, and #1!"}' https://api.github.com/markdown
# <p>Hello world github/linguist#1 <strong>cool</strong>, and #1!</p>'"
gh_toc_md2html() {
local gh_file_md=$1
URL=https://api.github.com/markdown/raw
if [ -n "$GH_TOC_TOKEN" ]; then
TOKEN=$GH_TOC_TOKEN
else
TOKEN="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt"
fi
if [ -f "$TOKEN" ]; then
URL="$URL?access_token=$(cat $TOKEN)"
fi
# echo $URL 1>&2
OUTPUT="$(curl -s --user-agent "$gh_user_agent" \
--data-binary @"$gh_file_md" -H "Content-Type:text/plain" \
$URL)"
if [ "$?" != "0" ]; then
echo "XXNetworkErrorXX"
fi
if [ "$(echo "${OUTPUT}" | awk '/API rate limit exceeded/')" != "" ]; then
echo "XXRateLimitXX"
else
echo "${OUTPUT}"
fi
}
#
# Checks whether the passed string is a url
#
gh_is_url() {
case $1 in
https* | http*)
echo "yes";;
*)
echo "no";;
esac
}
#
# TOC generator
#
gh_toc(){
local gh_src=$1
local gh_src_copy=$1
local gh_ttl_docs=$2
local need_replace=$3
if [ "$gh_src" = "" ]; then
echo "Please, enter URL or local path for a README.md"
exit 1
fi
# Show "TOC" string only if working with one document
if [ "$gh_ttl_docs" = "1" ]; then
echo "Table of Contents"
echo "================="
echo ""
gh_src_copy=""
fi
if [ "$(gh_is_url "$gh_src")" == "yes" ]; then
gh_toc_load "$gh_src" | gh_toc_grab "$gh_src_copy"
if [ "${PIPESTATUS[0]}" != "0" ]; then
echo "Could not load remote document."
echo "Please check your url or network connectivity"
exit 1
fi
if [ "$need_replace" = "yes" ]; then
echo
echo "!! '$gh_src' is not a local file"
echo "!! Can't insert the TOC into it."
echo
fi
else
local rawhtml=$(gh_toc_md2html "$gh_src")
if [ "$rawhtml" == "XXNetworkErrorXX" ]; then
echo "Parsing local markdown file requires access to github API"
echo "Please make sure curl is installed and check your network connectivity"
exit 1
fi
if [ "$rawhtml" == "XXRateLimitXX" ]; then
echo "Parsing local markdown file requires access to github API"
echo "Error: You exceeded the hourly limit. See: https://developer.github.com/v3/#rate-limiting"
TOKEN="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt"
echo "or place github auth token here: $TOKEN"
exit 1
fi
local toc=`echo "$rawhtml" | gh_toc_grab "$gh_src_copy"`
echo "$toc"
if [ "$need_replace" = "yes" ]; then
local ts="<\!--ts-->"
local te="<\!--te-->"
local dt=`date +'%F_%H%M%S'`
local ext=".orig.${dt}"
local toc_path="${gh_src}.toc.${dt}"
local toc_footer="<!-- Added by: `whoami`, at: `date --iso-8601='minutes'` -->"
# http://fahdshariff.blogspot.ru/2012/12/sed-mutli-line-replacement-between-two.html
# clear old TOC
sed -i${ext} "/${ts}/,/${te}/{//!d;}" "$gh_src"
# create toc file
echo "${toc}" > "${toc_path}"
echo -e "\n${toc_footer}\n" >> "$toc_path"
# insert toc file
if [[ "`uname`" == "Darwin" ]]; then
sed -i "" "/${ts}/r ${toc_path}" "$gh_src"
else
sed -i "/${ts}/r ${toc_path}" "$gh_src"
fi
echo
echo "!! TOC was added into: '$gh_src'"
echo "!! Origin version of the file: '${gh_src}${ext}'"
echo "!! TOC added into a separate file: '${toc_path}'"
echo
fi
fi
}
#
# Grabber of the TOC from rendered html
#
# $1 — a source url of document.
# It's needed if the TOC is generated for multiple documents.
#
gh_toc_grab() {
# if closed <h[1-6]> is on the new line, then move it on the prev line
# for example:
# was: The command <code>foo1</code>
# </h1>
# became: The command <code>foo1</code></h1>
sed -e ':a' -e 'N' -e '$!ba' -e 's/\n<\/h/<\/h/g' |
# find strings that correspond to the template
grep -E -o '<a.*id="user-content-[^"]*".*</h[1-6]' |
# remove code tags
sed 's/<code>//g' | sed 's/<\/code>//g' |
# now all rows are like:
# <a id="user-content-..." href="..."><span ...></span></a> ... </h1
# format result line
# * $0 — whole string
# * last element of each row: "</hN" where N in (1,2,3,...)
echo -e "$(awk -v "gh_url=$1" '{
level = substr($0, length($0), 1)
text = substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
href = substr($0, match($0, "href=\"[^\"]+?\"")+6, RLENGTH-7)
print sprintf("%*s", level*3, " ") "* [" text "](" gh_url href ")" }' |
sed 'y/+/ /; s/%/\\x/g')"
}
#
# Returns filename only from full path or url
#
gh_toc_get_filename() {
echo "${1##*/}"
}
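# Example: gh_toc_get_filename "https://example.com/docs/README.md" prints "README.md"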
#
# Option handlers
#
gh_toc_app() {
local app_name=$(basename $0)
local need_replace="no"
if [ "$1" = '--help' ] || [ $# -eq 0 ] ; then
echo "GitHub TOC generator ($app_name): $gh_toc_version"
echo ""
echo "Usage:"
echo " $app_name [--insert] src [src] Create TOC for a README file (url or local path)"
echo " $app_name - Create TOC for markdown from STDIN"
echo " $app_name --help Show help"
echo " $app_name --version Show version"
return
fi
if [ "$1" = '--version' ]; then
echo "$gh_toc_version"
echo
echo "os: `lsb_release -d | cut -f 2`"
echo "kernel: `cat /proc/version`"
echo "shell: `$SHELL --version`"
echo
for tool in curl wget grep awk sed; do
printf "%-5s: " $tool
echo `$tool --version | head -n 1`
done
return
fi
if [ "$1" = "-" ]; then
if [ -z "$TMPDIR" ]; then
TMPDIR="/tmp"
elif [ -n "$TMPDIR" -a ! -d "$TMPDIR" ]; then
mkdir -p "$TMPDIR"
fi
local gh_tmp_md
gh_tmp_md=$(mktemp $TMPDIR/tmp.XXXXXX)
while read input; do
echo "$input" >> "$gh_tmp_md"
done
gh_toc_md2html "$gh_tmp_md" | gh_toc_grab ""
return
fi
if [ "$1" = '--insert' ]; then
need_replace="yes"
shift
fi
for md in "$@"
do
echo ""
gh_toc "$md" "$#" "$need_replace"
done
echo ""
echo "Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)"
}
#
# Entry point
#
gh_toc_app "$@"
#!/usr/bin/python
#
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plot graph showing process metric values over time"""
from __future__ import print_function
import argparse
import sys
import json
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as backend_pdf
import matplotlib.ticker as tick
colors=['b', 'r', 'g', 'c', 'pink']
def visualize(file_path):
entries = []
with open(file_path) as f:
entries = [json.loads(line) for line in f.readlines() if line.strip()]
if not entries:
print('There is no data in file {}'.format(file_path))
return
pdf = backend_pdf.PdfPages("process_info.pdf")
idx = 0
names = [name for name in entries[0].keys() if name != 'time']
times = [entry['time'] for entry in entries]
for name in names:
values = [entry[name] for entry in entries]
fig = plt.figure()
ax = plt.gca()
ax.yaxis.set_major_formatter(tick.ScalarFormatter(useMathText=True))
plt.ticklabel_format(style='sci', axis='y', scilimits=(-2,3))
plt.plot(times, values, colors[idx % len(colors)], marker='x', label=name)
plt.xlabel('Time (sec)')
plt.ylabel(name)
plt.ylim(ymin=0)
plt.legend(loc = 'upper left')
pdf.savefig(fig)
idx += 1
plt.show()
pdf.close()
print('Generated process_info.pdf from {}'.format(file_path))
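# The input file is expected to contain one JSON object per line, each with a
# 'time' key plus one key per metric to plot, e.g. (illustrative):
#   {"time": 0.5, "cpu_percent": 12.0, "memory_gb": 1.2}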
if __name__ == '__main__':
parser = argparse.ArgumentParser(usage='plot_process_info.py <path_to_file>' )
parser.add_argument('file_path', type=str)
flags = parser.parse_args(sys.argv[1:])
visualize(flags.file_path)
# tf_cnn_benchmarks: High performance benchmarks
**Note: tf_cnn_benchmarks is no longer maintained.**
tf_cnn_benchmarks contains TensorFlow 1 implementations of several popular
convolutional models, and is designed to be as fast as possible.
tf_cnn_benchmarks supports both running on a single machine and running in
distributed mode across multiple hosts.
tf_cnn_benchmarks is no longer maintained. Although it will run with TensorFlow
2, it was written and optimized for TensorFlow 1, and has not been maintained
since TensorFlow 2 was released. For clean and easy-to-read TensorFlow 2 models,
please see the [TensorFlow Official
Models](https://github.com/tensorflow/models/tree/master/official).
## Getting Started
To run ResNet50 on a single GPU with synthetic data and without distortions, run
```
python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
```
Note that the master branch of tf_cnn_benchmarks occasionally requires the
latest nightly version of TensorFlow. You can install the nightly version by
running `pip install tf-nightly-gpu` in a clean environment, or by installing
TensorFlow from source. We sometimes will create a branch of tf_cnn_benchmarks,
in the form of cnn_tf_vX.Y_compatible, that is compatible with TensorFlow
version X.Y. For example, branch
[cnn_tf_v1.9_compatible](https://github.com/tensorflow/benchmarks/tree/cnn_tf_v1.9_compatible/scripts/tf_cnn_benchmarks)
works with TensorFlow 1.9. However, as tf_cnn_benchmarks is no longer
maintained, we will likely no longer create new branches.
Some important flags are
* model: Model to use, e.g. resnet50, inception3, vgg16, and alexnet.
* num_gpus: Number of GPUs to use.
* data_dir: Path to data to process. If not set, synthetic data is used. To
use Imagenet data use these
[instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started)
as a starting point.
* batch_size: Batch size for each GPU.
* variable_update: The method for managing variables: parameter_server,
replicated, distributed_replicated, independent.
* local_parameter_device: Device to use as parameter server: cpu or gpu.
To see the full list of flags, run `python tf_cnn_benchmarks.py --help`.
To run ResNet50 with real data with 8 GPUs, run:
```
python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=256 \
--model=resnet50 --optimizer=momentum --variable_update=replicated \
--nodistortions --gradient_repacking=8 --num_gpus=8 \
--num_epochs=90 --weight_decay=1e-4 --data_dir=${DATA_DIR} --use_fp16 \
--train_dir=${CKPT_DIR}
```
This will train a ResNet-50 model on ImageNet with a global batch size of 2048
(256 per GPU) across 8 GPUs. The model should train to around 76% accuracy.
## Running the tests
To run the tests, run
```bash
pip install portpicker
python run_tests.py && python run_tests.py --run_distributed_tests
```
Note the tests require portpicker.
The command above runs a subset of tests that is both fast and fairly
comprehensive. Alternatively, all the tests can be run, but this will take a
long time:
```bash
python run_tests.py --full_tests && python run_tests.py --full_tests --run_distributed_tests
```
We will run all tests on every PR before merging them, so it is not necessary
to pass `--full_tests` when running tests yourself.
To run an individual test, such as method `testParameterServer` of test class
`TfCnnBenchmarksTest` of module `benchmark_cnn_test`, run
```bash
python -m unittest -v benchmark_cnn_test.TfCnnBenchmarksTest.testParameterServer
```
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks.
tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is
useful for benchmarking the performance of just this gradient aggregation,
instead of the entire model. All the flags that tf_cnn_benchmarks accepts are
also accepted by this script, although many are silently ignored.
The number and shapes of the tensors all-reduced are those of the variables of
the model specified by the --model flag.
TODO(reedwm): Allow custom sizes to be specified.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import app
from absl import flags as absl_flags
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import control_flow_ops
import benchmark_cnn
import cnn_util
import flags
from cnn_util import log_fn
absl_flags.DEFINE_integer('iters_per_step', 5,
'Number of iterations to run all-reduce for, per '
'step. Every step, a session will be run on a Graph '
'that contains this many copies of the all-reduce. '
'The copies are run sequentially. Setting this above '
'1 is useful to lower the overhead of starting the '
'session run, running the VariableV2 ops at the '
'start of the step, etc.')
flags.define_flags()
for name in flags.param_specs.keys():
absl_flags.declare_key_flag(name)
def get_var_shapes(model):
"""Returns the list of variable shapes for a tf_cnn_benchmarks Model."""
with tf.Graph().as_default():
# The variable shapes do not depend on the batch size.
images = tf.placeholder(tf.float32, model.get_input_shapes('train')[0])
model.build_network([images])
return [[int(d) for d in v.shape.dims] for v in tf.trainable_variables()]
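# The returned value is a list of shapes, one entry per trainable variable of
# the model, e.g. (illustrative): [[7, 7, 3, 64], [64], ...].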
def all_reduce(all_device_tensors, variable_mgr):
"""Performs a single batch all-reduce.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
variable_mgr: The VariableMgr to perform the all-reduce.
Returns:
List of list of tensors in the same form as `all_device_tensors`, except the
tensors are aggregated across towers.
"""
tower_grads = [[(g, None) for g in device_tensors] for
device_tensors in all_device_tensors]
_, aggregated_tower_grads = variable_mgr.preprocess_device_grads(tower_grads)
return [
[g for g, _ in agg_device_tensors]
for agg_device_tensors in aggregated_tower_grads]
def build_all_reduce_iterations(all_device_tensors, tower_devices, variable_mgr,
num_iters):
"""Builds the all-reduce ops for multiple iterations to aggregate tensors.
The tensors in `all_device_tensors` are aggregated `num_iters` times. Each
iteration aggregates the results from the previous iteration. The iterations
are run sequentially, so the aggregations for an iteration do not start
running until the previous iteration has completed. Each iteration after the
first is aggregating already-aggregated values, but it does not matter because
we are only aggregating for benchmarking purposes.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
tower_devices: A list of device strings. tower_devices[t] is the device
of the tensors in all_device_tensors[t].
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that when run, causes the all-reduce ops to run.
"""
for i in range(num_iters):
with tf.name_scope('iteration_%d' % i):
# Step 1: Do the aggregation.
with tf.name_scope('tensor_aggregation'):
all_device_tensors = all_reduce(all_device_tensors, variable_mgr)
# Step 2. Create identity ops, to bring the aggregated results back to
# each device.
new_all_device_tensors = []
for device, device_tensors in zip(tower_devices, all_device_tensors):
with tf.device(device):
new_all_device_tensors.append([
tf.identity(t, name='identity_after_allreduce')
for t in device_tensors
])
all_device_tensors = new_all_device_tensors
# Step 3. Add control dependencies to delay the next iteration until this
# iteration is complete. To avoid extra overhead, we do not have any
# cross-device control dependencies, which means it's possible for two
# iterations to slightly overlap.
new_all_device_tensors = []
for device_tensors in all_device_tensors:
new_all_device_tensors.append([
control_flow_ops.with_dependencies(
device_tensors, t, name='identity_after_dependencies')
for t in device_tensors
])
all_device_tensors = new_all_device_tensors
# To prevent the dependency optimizer from removing every op we created,
# we store the results in variables.
ops_to_run = []
for device, device_tensors in zip(tower_devices, all_device_tensors):
with tf.device(device):
for t in device_tensors:
# The placeholder initial value is never run.
var = tf.Variable(tf.placeholder(tf.float32, t.shape), collections=[])
ops_to_run.append(var.assign(t))
return tf.group(*ops_to_run)
def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters):
"""Builds the graph for the benchmark.
Args:
tower_devices: A list of device strings of the devices to run the all-reduce
benchmark on.
tensor_shapes: A list of shapes of the tensors that will be aggregated for
the all-reduce.
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that runs the benchmark.
"""
all_device_tensors = []
for i, tower_device in enumerate(tower_devices):
with tf.device(tower_device):
device_tensors = []
for j, shape in enumerate(tensor_shapes):
tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32),
name='tensor_%d_on_device_%d' % (j, i))
device_tensors.append(tensor)
all_device_tensors.append(device_tensors)
log_fn('Building all-reduce ops')
benchmark_op = build_all_reduce_iterations(all_device_tensors, tower_devices,
variable_mgr, num_iters)
log_fn('Done building all-reduce ops')
return benchmark_op
def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op):
"""Runs the graph for the benchmark.
Args:
benchmark_op: An op that runs the benchmark.
bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
init_ops: A list of ops that are run before `benchmark_op` for
initialization.
dummy_loss_op: Any op. We must pass a loss op to
`benchmark_cnn.benchmark_one_step`, but the result of the op is never
actually used.
"""
config = benchmark_cnn.create_config_proto(bench_cnn.params)
with tf.Session(config=config) as sess:
for op in init_ops:
sess.run(op)
step_train_times = []
fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op}
log_fn('Running warmup')
for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches):
if i == 0:
log_fn('Running all-reduce ops')
start = time.perf_counter()
if i > 0 and i % bench_cnn.params.display_every == 0:
log_fn('Iteration: %d. Average time per step so far: %s' %
(i, (time.perf_counter() - start) / i))
# Call benchmark_one_step instead of directly calling sess.run(...), to
# potentially get a trace file, partitioned graphs, etc.
benchmark_cnn.benchmark_one_step(
sess=sess,
fetches=fetches,
step=i,
# The batch size is only used for the images/sec calculation, which is
# not actually calculated because we pass show_images_per_sec=False.
batch_size=None,
step_train_times=step_train_times,
trace_filename=bench_cnn.trace_filename,
partitioned_graph_file_prefix=(
bench_cnn.params.partitioned_graph_file_prefix),
profiler=None,
image_producer=None,
params=bench_cnn.params,
show_images_per_sec=False)
log_fn('Average time per step: %s' %
((time.perf_counter() - start) / bench_cnn.num_batches))
def run_benchmark(bench_cnn, num_iters):
"""Runs the all-reduce benchmark.
Args:
bench_cnn: The BenchmarkCNN where params, the variable manager, and other
attributes are obtained.
num_iters: Number of iterations to do all-reduce for.
Raises:
ValueError: Invalid params of bench_cnn.
"""
if bench_cnn.params.variable_update != 'replicated':
raise ValueError('--variable_update=replicated must be specified to use '
                 'the all-reduce benchmark')
if bench_cnn.params.variable_consistency == 'relaxed':
raise ValueError('--variable_consistency=relaxed is not supported')
benchmark_op = build_graph(bench_cnn.raw_devices,
get_var_shapes(bench_cnn.model),
bench_cnn.variable_mgr, num_iters)
init_ops = [
tf.global_variables_initializer(),
bench_cnn.variable_mgr.get_post_init_ops()
]
loss_op = tf.no_op()
if bench_cnn.graph_file:
path, filename = os.path.split(bench_cnn.graph_file)
as_text = filename.endswith('txt')
log_fn('Writing GraphDef as %s to %s' % (
'text' if as_text else 'binary', bench_cnn.graph_file))
tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
path, filename, as_text)
run_graph(benchmark_op, bench_cnn, init_ops, loss_op)
# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks
def main(positional_arguments):
# Command-line arguments like '--distortions False' are equivalent to
# '--distortions=True False', where False is a positional argument. To prevent
# this from silently running with distortions, we do not allow positional
# arguments.
assert len(positional_arguments) >= 1
if len(positional_arguments) > 1:
raise ValueError('Received unknown positional arguments: %s'
% positional_arguments[1:])
params = benchmark_cnn.make_params_from_flags()
params = benchmark_cnn.setup(params)
bench = benchmark_cnn.BenchmarkCNN(params)
tfversion = cnn_util.tensorflow_version_tuple()
log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
run_benchmark(bench, absl_flags.FLAGS.iters_per_step)
if __name__ == '__main__':
tf.disable_v2_behavior()
app.run(main) # Raises error on invalid flags, unlike tf.app.run()
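# Hypothetical usage sketch (flag values are illustrative only): run_benchmark()
# above requires --variable_update=replicated, and the number of all-reduce
# iterations per step comes from --iters_per_step, so an invocation might look
# like:
#   python all_reduce_benchmark.py --variable_update=replicated --num_gpus=8 \
#       --num_batches=100 --iters_per_step=10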
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for all_reduce_benchmark.py."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v1 as tf
import all_reduce_benchmark
import benchmark_cnn
import test_util
class AllReduceBenchmarkTest(tf.test.TestCase):
"""Tests the all-reduce benchmark."""
def _test_run_benchmark(self, params):
"""Tests that run_benchmark() runs successfully with the params."""
logs = []
with test_util.monkey_patch(all_reduce_benchmark,
log_fn=test_util.print_and_add_to_list(logs)):
bench_cnn = benchmark_cnn.BenchmarkCNN(params)
all_reduce_benchmark.run_benchmark(bench_cnn, num_iters=5)
self.assertRegex(logs[-1], '^Average time per step: [0-9.]+$')
def test_run_benchmark(self):
"""Tests that run_benchmark() runs successfully."""
params = benchmark_cnn.make_params(num_batches=10,
variable_update='replicated',
num_gpus=2)
self._test_run_benchmark(params)
params = params._replace(hierarchical_copy=True, gradient_repacking=8,
num_gpus=8)
self._test_run_benchmark(params)
if __name__ == '__main__':
tf.disable_v2_behavior()
tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for allreduce."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections as pycoll
import re
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
try:
from tensorflow.python.distribute.v1 import all_reduce
except ImportError:
# Compatibility with TF 2.4 and below
from tensorflow.python.distribute import all_reduce
from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import ops
from tensorflow.python.ops import collective_ops
AllReduceSpecTuple = pycoll.namedtuple('AllReduceSpecTuple', 'alg shards limit')
def parse_general_int(s):
"""Parse integer with power-of-2 suffix eg. 32k."""
mo = re.match(r'(\d+)([KkMGT]?)$', s)
if mo:
i, suffix = mo.group(1, 2)
v = int(i)
if suffix:
if suffix == 'K' or suffix == 'k':
v *= 1024
elif suffix == 'M':
v *= (1024 * 1024)
elif suffix == 'G':
v *= (1024 * 1024 * 1024)
elif suffix == 'T':
v *= (1024 * 1024 * 1024 * 1024)
else:
raise ValueError('invalid integer string %s' % s)
return v
else:
v = int(s)
return v
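# A minimal, hypothetical sketch of the parsing rules above; the _example_*
# helper is illustrative only and is never called.
def _example_parse_general_int():
  assert parse_general_int('32') == 32
  assert parse_general_int('32k') == 32 * 1024
  assert parse_general_int('2M') == 2 * 1024 * 1024
  assert parse_general_int('1G') == 1024 ** 3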
def parse_all_reduce_spec(all_reduce_spec):
"""Parse all_reduce_spec.
Args:
all_reduce_spec: a string specifying a combination of all-reduce
algorithms to apply for gradient reduction.
Returns:
a list of AllReduceSpecTuple.
Raises:
ValueError: all_reduce_spec is not well-formed.
An all_reduce_spec has BNF form:
int ::= positive whole number
g_int ::= int[KkMGT]?
alg_spec ::= alg | alg#int
range_spec ::= alg_spec | alg_spec/alg_spec
spec ::= range_spec | range_spec:g_int:range_spec
Not all syntactically correct specifications are supported.
Examples of supported all_reduce_spec strings, with semantics explained:
'collective' == apply tf.collective_reduce operator to all tensors.
'collective#2' == apply tf.collective_reduce operator to all tensors,
requesting up to 2 simultaneous transfers at each node, if
feasible, by subdividing tensor by an additional factor of 2.
'xring' == apply ring all-reduce to all tensors
'xring#2' == apply ring all-reduce to all tensors, using two simultaneous
transfer rings, each operating on 1/2 of each tensor.
'nccl' == apply NCCL all-reduce to all tensors (only works within
a single worker process where all devices are GPUs)
'nccl/xring' == apply NCCL all-reduce to all tensors within each worker
to produce at least one full-reduced (locally) value,
then apply ring all-reduce to one such value from each
worker, then apply NCCL broadcast to propagate those globally
reduced values back to every device within each worker.
'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each
distributed tensor is reduced by copying all instances to
one of the worker CPUs, computing the reduction there, then
copying back to each participating device. Tensor reductions
are assigned to specific CPUs round-robin.
'psgpu#4' == Arrange all GPUs across all workers into groups of 4.
Each distributed tensor is shuffle reduced against one
such group of 4 GPUs, selected round-robin. That is, each
tensor is split across 4 shards for the reduction.
'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to
tensors of size <= 2048 elements, apply 2-shard pscpu to
tensors up to size 64k elements, apply xring to larger tensors.
'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on
the worker's CPU, then use 2-shard shuffle to reduce those
locally reduced tensors across workers (on the worker CPUs), then
scatter the globally reduced values locally from each worker CPU.
"""
range_parts = all_reduce_spec.split(':') + ['-1']
if len(range_parts) % 2:
raise ValueError('all_reduce_spec not well formed: %s' % all_reduce_spec)
limit = 0
spec = []
alg = None
shards = 1
for i, range_part in enumerate(range_parts):
if i % 2 == 1:
try:
limit = parse_general_int(range_part)
spec.append(AllReduceSpecTuple(alg=alg, shards=shards, limit=limit))
except ValueError:
raise ValueError('all_reduce_spec (%s) contains non-integer range %s' %
(all_reduce_spec, range_part))
else:
alg = range_part
alg_parts = range_part.split('#')
alg = alg_parts[0]
if len(alg_parts) > 1:
try:
shards = int(alg_parts[1])
except ValueError:
raise ValueError('all_reduce_spec (%s) contains non-integer '
                 'shards %s' % (all_reduce_spec, alg_parts[1]))
else:
shards = 1
if alg not in [
'nccl', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu', 'xring', 'pscpu',
'psgpu', 'pscpu/pscpu', 'collective'
]:
raise ValueError('all_reduce_spec (%s) contains invalid alg %s' %
(all_reduce_spec, alg))
return spec
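# A minimal, hypothetical sketch of how a multi-range spec decomposes into
# AllReduceSpecTuple entries; the trailing implicit limit of -1 covers all
# remaining tensor sizes. The _example_* helper is illustrative only and is
# never called.
def _example_parse_all_reduce_spec():
  spec = parse_all_reduce_spec('pscpu:2k:pscpu#2:64k:xring')
  assert spec == [
      AllReduceSpecTuple(alg='pscpu', shards=1, limit=2 * 1024),
      AllReduceSpecTuple(alg='pscpu', shards=2, limit=64 * 1024),
      AllReduceSpecTuple(alg='xring', shards=1, limit=-1),
  ]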
def build_all_reduce_device_prefixes(job_name, num_tasks):
"""Build list of device prefix names for all_reduce.
Args:
job_name: 'worker', 'ps' or 'localhost'.
num_tasks: number of jobs across which device names should be generated.
Returns:
A list of device name prefix strings. Each element spells out the full
host name without adding the device.
e.g. '/job:worker/task:0'
"""
if job_name != 'localhost':
return ['/job:%s/task:%d' % (job_name, d) for d in range(0, num_tasks)]
else:
assert num_tasks == 1
return ['/job:%s' % job_name]
def group_device_names(devices, group_size):
"""Group device names into groups of group_size.
Args:
devices: list of strings naming devices.
group_size: int >= 1
Returns:
list of lists of devices, where each inner list is group_size long,
and each device appears at least once in an inner list. If
len(devices) % group_size == 0 then each device will appear
exactly once.
Raises:
ValueError: group_size > len(devices)
"""
num_devices = len(devices)
if group_size > num_devices:
raise ValueError('only %d devices, but group_size=%d' % (num_devices,
group_size))
num_groups = (
num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
groups = [[] for i in range(num_groups)]
for i in range(0, num_groups * group_size):
groups[i % num_groups].append(devices[i % num_devices])
return groups
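# A minimal, hypothetical sketch of the wrap-around grouping described above:
# with 5 devices and group_size=2, three groups are formed and one device
# ('d0') appears twice. The _example_* helper is illustrative only and is
# never called.
def _example_group_device_names():
  groups = group_device_names(['d0', 'd1', 'd2', 'd3', 'd4'], 2)
  assert groups == [['d0', 'd3'], ['d1', 'd4'], ['d2', 'd0']]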
def split_grads_by_size(threshold_size, device_grads):
"""Break gradients into two sets according to tensor size.
Args:
threshold_size: int size cutoff for small vs large tensor.
device_grads: List of lists of (gradient, variable) tuples. The outer
list is over devices. The inner list is over individual gradients.
Returns:
small_grads: Subset of device_grads where shape is <= threshold_size
elements.
large_grads: Subset of device_grads where shape is > threshold_size
elements.
"""
small_grads = []
large_grads = []
for dl in device_grads:
small_dl = []
large_dl = []
for (g, v) in dl:
tensor_size = g.get_shape().num_elements()
if tensor_size <= threshold_size:
small_dl.append([g, v])
else:
large_dl.append([g, v])
if small_dl:
small_grads.append(small_dl)
if large_dl:
large_grads.append(large_dl)
return small_grads, large_grads
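# A minimal, hypothetical sketch of the size-based split: gradients with at
# most threshold_size elements go to the small set, the rest to the large set.
# The _example_* helper is illustrative only and is never called.
def _example_split_grads_by_size():
  g_small = tf.zeros([10])       # 10 elements, <= threshold
  g_large = tf.zeros([100, 10])  # 1000 elements, > threshold
  device_grads = [[(g_small, 'v0'), (g_large, 'v1')]]
  small, large = split_grads_by_size(64, device_grads)
  assert small[0][0][0] is g_small and small[0][0][1] == 'v0'
  assert large[0][0][0] is g_large and large[0][0][1] == 'v1'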
_instance_key = 1
def new_collective_instance_key():
"""Returns a new instance key for use in defining a collective op."""
global _instance_key
v = _instance_key
_instance_key += 1
return v
_group_key = 1
_group_key_table = dict()
def collective_group_key(devices):
"""Returns a group key for the set of devices.
Args:
devices: list of strings naming devices in a collective group.
Returns:
int key uniquely identifying the set of device names.
"""
global _group_key
global _group_key_table
parsed = [pydev.DeviceSpec.from_string(d) for d in devices]
names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed])
concat = ','.join(names)
if concat not in _group_key_table.keys():
new_key = _group_key
_group_key += 1
_group_key_table[concat] = new_key
rv = _group_key_table[concat]
return rv
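# A minimal, hypothetical sketch: the group key depends only on the sorted
# (device_type, device_index) pairs, so permuted device lists and different
# task numbers map to the same key. The _example_* helper is illustrative only
# and is never called.
def _example_collective_group_key():
  k0 = collective_group_key(['/job:worker/task:0/device:GPU:0',
                             '/job:worker/task:0/device:GPU:1'])
  k1 = collective_group_key(['/job:worker/task:1/device:GPU:1',
                             '/job:worker/task:1/device:GPU:0'])
  assert k0 == k1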
def build_collective_reduce(input_tensors, num_workers, num_shards,
red_op='Add', un_op='Id'):
"""Build a subgraph that does one full all-reduce, using the collective Op.
Args:
input_tensors: tensors within a single worker graph that are to be reduced
together; must be one per device.
num_workers: total number of workers with identical independent graphs that
will be doing this same reduction. The reduction will actually include
the corresponding tensors at all these workers.
num_shards: number of shards into which to divide each per-tick chunk,
normally 1 but could be higher on multi-data-path architectures.
red_op: string naming the reduction op
un_op: string naming the unary final op
Returns:
An array of final tensors, one per device, computed by the full reduction.
Raises:
ValueError: There must be at least two tensors over all the workers.
"""
group_size = len(input_tensors) * num_workers
if group_size < 2:
raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
devices = [t.device for t in input_tensors]
num_devices = len(devices)
group_key = collective_group_key(devices)
instance_key = new_collective_instance_key()
out_tensors = []
if num_shards == 1:
subdiv_offsets = [0]
elif num_shards == 2:
if num_devices > 1:
subdiv_offsets = [0, -(num_devices // 2)]
else:
subdiv_offsets = [0]
else:
raise ValueError('Unsupported num_shards %d' % num_shards)
for d in range(num_devices):
with ops.device(devices[d]):
reduce_op = collective_ops.all_reduce(input_tensors[d],
group_size, group_key, instance_key,
red_op, un_op,
subdiv_offsets)
out_tensors.append(reduce_op)
return out_tensors
def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
return collective_ops.broadcast_send(t, shape, dtype, group_size, group_key,
instance_key)
def broadcast_recv(shape, dtype, group_size, group_key, instance_key):
return collective_ops.broadcast_recv(shape, dtype, group_size, group_key,
instance_key)
def sum_grad_and_var_all_reduce(single_session,
grad_and_vars,
num_workers,
alg,
gpu_indices,
aux_devices=None,
num_shards=1):
"""Apply all-reduce algorithm over specified gradient tensors."""
scaled_grads = [g for g, _ in grad_and_vars]
if alg == 'collective':
assert not single_session
summed_grads = build_collective_reduce(
scaled_grads, num_workers, num_shards, 'Add', 'Id')
else:
with tf.name_scope('allreduce'):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
if alg == 'nccl':
summed_grads = all_reduce.build_nccl_all_reduce(scaled_grads, tf.add)
elif alg == 'xring':
summed_grads = all_reduce.build_ring_all_reduce(
scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
elif alg == 'nccl/xring':
summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
tf.add)
elif alg == 'nccl/rechd':
summed_grads = all_reduce.build_nccl_then_recursive_hd(
scaled_grads, tf.add)
elif alg == 'nccl/pscpu':
summed_grads = all_reduce.build_nccl_then_shuffle(
scaled_grads, aux_devices, tf.add, tf.add_n)
elif alg == 'pscpu/pscpu':
summed_grads = all_reduce.build_shuffle_then_shuffle(
scaled_grads,
aux_devices,
# TODO(tucker): devise a way of better specifying the device set
# for the second level.
[aux_devices[0]],
tf.add_n)
elif alg in ['pscpu', 'psgpu']:
summed_grads = all_reduce.build_shuffle_all_reduce(
scaled_grads, aux_devices, tf.add_n)
else:
raise ValueError('unsupported all_reduce alg: %s' % alg)
result = []
for (_, v), g in zip(grad_and_vars, summed_grads):
result.append([g, v])
return result
def contains_any(haystack, needles):
"""Tests if any needle is a substring of haystack.
Args:
haystack: a string
needles: list of strings
Returns:
True if any element of needles is a substring of haystack,
False otherwise.
"""
for n in needles:
if n in haystack:
return True
return False
def sum_gradients_all_reduce(single_session,
dev_prefixes,
tower_grads,
num_workers,
alg,
num_shards,
gpu_indices,
agg_small_grads_max_bytes=0,
agg_small_grads_max_group=10,
allreduce_merge_scope=1):
"""Apply all-reduce algorithm over specified gradient tensors.
Args:
single_session: true if reduction is applied to one graph across
all workers, false if this application is to a single-worker graph only.
dev_prefixes: list of prefix strings to use to generate PS device names.
tower_grads: the gradients to reduce.
num_workers: number of worker processes across entire job.
alg: the all-reduce algorithm to apply.
num_shards: alg-specific sharding factor.
gpu_indices: indices of local GPUs in order usable for ring-reduce.
agg_small_grads_max_bytes: largest tensor eligible for aggregation,
in number of bytes.
agg_small_grads_max_group: largest permitted aggregation of small
tensors.
allreduce_merge_scope: size of groups into which to partition consecutive
gradients grouped under a common 'allreduce' name scope for application
of ScopedAllocator optimization.
Returns:
list of reduced tensors
"""
alg_contains_shuffle = contains_any(alg, ['pscpu', 'psgpu'])
is_hierarchical = '/' in alg
if 'pscpu' in alg:
aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
elif 'psgpu' in alg:
aux_devices = [
prefix + '/gpu:%d' % i
for i in range(len(gpu_indices))
for prefix in dev_prefixes
]
else:
aux_devices = ['/job:localhost/cpu:0']
aux_device_groups = group_device_names(
aux_devices,
num_shards if (alg != 'collective' and alg_contains_shuffle) else 1)
group_index = 0
if agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
tower_grads, packing = pack_small_tensors(
tower_grads,
max_bytes=agg_small_grads_max_bytes,
max_group=agg_small_grads_max_group)
else:
packing = None
reduced_gv_list = []
gv = list(zip(*tower_grads))
merge_scope = allreduce_merge_scope if allreduce_merge_scope > 0 else 1
chunked_gv = [gv[x:x + merge_scope]
for x in xrange(0, len(gv), merge_scope)]
for chunk in chunked_gv:
with tf.name_scope('allreduce'):
for grad_and_vars in chunk:
reduced_gv_list.append(sum_grad_and_var_all_reduce(
single_session,
grad_and_vars, num_workers, alg, gpu_indices,
(aux_devices if is_hierarchical
else aux_device_groups[group_index]),
num_shards))
group_index = (group_index + 1) % len(aux_device_groups)
new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
if packing:
new_tower_grads = unpack_small_tensors(new_tower_grads, packing)
return new_tower_grads
def extract_ranges(index_list, range_size_limit=32):
"""Extract consecutive ranges and singles from index_list.
Args:
index_list: List of monotonically increasing non-negative integers.
range_size_limit: Largest size range to return. If a larger
consecutive range exists it will be returned as multiple
ranges.
Returns:
ranges, singles where ranges is a list of [first, last] pairs of
consecutive elements in index_list, and singles is all of the
other elements, in original order.
"""
if not index_list:
return [], []
first = index_list[0]
last = first
ranges = []
singles = []
for i in index_list[1:]:
if i == last + 1 and (last - first) <= range_size_limit:
last = i
else:
if last > first:
ranges.append([first, last])
else:
singles.append(first)
first = i
last = i
if last > first:
ranges.append([first, last])
else:
singles.append(first)
return ranges, singles
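# A minimal, hypothetical sketch: consecutive runs of indices become
# [first, last] ranges, while isolated indices are returned as singles.
# The _example_* helper is illustrative only and is never called.
def _example_extract_ranges():
  ranges, singles = extract_ranges([1, 3, 4, 6, 7, 8, 9])
  assert ranges == [[3, 4], [6, 9]]
  assert singles == [1]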
GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
def pack_range(key, packing, grad_vars, rng):
"""Form the concatenation of a specified range of gradient tensors.
Args:
key: Value under which to store meta-data in packing that will be used
later to restore the grad_var list structure.
packing: Dict holding data describing packed ranges of small tensors.
grad_vars: List of (grad, var) pairs for one tower.
rng: A pair of integers giving the first, last indices of a consecutive
range of tensors to be packed.
Returns:
A tensor that is the concatenation of all the specified small tensors.
"""
to_pack = grad_vars[rng[0]:rng[1] + 1]
members = []
variables = []
restore_shapes = []
with tf.name_scope('pack'):
for g, v in to_pack:
variables.append(v)
restore_shapes.append(g.shape)
with tf.device(g.device):
members.append(tf.reshape(g, [-1]))
packing[key] = GradPackTuple(
indices=range(rng[0], rng[1] + 1),
vars=variables,
shapes=restore_shapes)
with tf.device(members[0].device):
return tf.concat(members, 0)
def unpack_grad_tuple(gv, gpt):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, possibly after subsequent operations such as
reduction.
"""
elt_widths = [x.num_elements() for x in gpt.shapes]
with tf.device(gv[0][0].device):
with tf.name_scope('unpack'):
splits = tf.split(gv[0], elt_widths)
unpacked_gv = []
for idx, s in enumerate(splits):
unpacked_gv.append((tf.reshape(s, gpt.shapes[idx]), gpt.vars[idx]))
return unpacked_gv
def pack_small_tensors(tower_grads, max_bytes=0, max_group=0):
"""Concatenate small gradient tensors together for reduction.
Args:
tower_grads: List of lists of (gradient, variable) tuples.
max_bytes: Int giving max number of bytes in a tensor that
may be considered small.
max_group: Int giving max number of small tensors that may be
concatenated into one new tensor.
Returns:
new_tower_grads, packing where new_tower_grads is identical to
tower_grads except that all feasible small_tensors have been removed
from their places and concatenated into larger tensors that are
now in the front of the list for each tower, and packing contains
the data necessary to restore the tower_grads structure.
Look through the first tower for gradients of the same type (float),
and small size, that are all sequential. For each such group,
replace by a new tensor that is a flattened concatenation. Note
that the corresponding variable will be absent, which doesn't matter
because it isn't used during all-reduce.
Requires:
Every gv_list in towers must have isomorphic structure including identical
tensor sizes and types.
"""
small_indices = []
large_indices = []
for idx, (g, _) in enumerate(tower_grads[0]):
if g.dtype == tf.float32 and (4 * g.shape.num_elements()) <= max_bytes:
small_indices.append(idx)
else:
large_indices.append(idx)
small_ranges, small_singles = extract_ranges(
small_indices, range_size_limit=max_group)
large_indices = sorted(large_indices + small_singles)
num_gv = len(tower_grads[0])
packing = {}
if small_ranges:
new_tower_grads = []
for dev_idx, gv_list in enumerate(tower_grads):
assert len(gv_list) == num_gv
new_gv_list = []
for r in small_ranges:
key = '%d:%d' % (dev_idx, len(new_gv_list))
new_gv_list.append((pack_range(key, packing, gv_list, r),
'packing_var_placeholder'))
for i in large_indices:
new_gv_list.append(gv_list[i])
new_tower_grads.append(new_gv_list)
return new_tower_grads, packing
else:
return tower_grads, None
def unpack_small_tensors(tower_grads, packing):
"""Undo the structure alterations to tower_grads done by pack_small_tensors.
Args:
tower_grads: List of List of (grad, var) tuples.
packing: A dict generated by pack_small_tensors describing the changes
it made to tower_grads.
Returns:
new_tower_grads: identical to tower_grads except that concatenations
of small tensors have been split apart and returned to their original
positions, paired with their original variables.
"""
if not packing:
return tower_grads
new_tower_grads = []
num_devices = len(tower_grads)
num_packed = len(packing.keys()) // num_devices
for dev_idx, gv_list in enumerate(tower_grads):
new_gv_list = gv_list[num_packed:]
for i in xrange(0, num_packed):
k = '%d:%d' % (dev_idx, i)
gpt = packing[k]
gv = unpack_grad_tuple(gv_list[i], gpt)
for gi, idx in enumerate(gpt.indices):
assert idx == gpt.indices[gi]
new_gv_list.insert(idx, gv[gi])
new_tower_grads.append(new_gv_list)
return new_tower_grads
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_cnn_benchmark.allreduce."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections as pycoll
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import variables
import allreduce
class AllReduceTest(tf.test.TestCase):
def testGroupKey(self):
d0 = ['/job:worker/replica:0/task:0/device:GPU:1',
'/job:worker/replica:0/task:0/device:GPU:0',
'/job:worker/replica:0/task:0/device:GPU:3',]
d1 = ['/job:worker/replica:0/task:1/device:GPU:1',
'/job:worker/replica:0/task:1/device:GPU:0',
'/job:worker/replica:0/task:1/device:GPU:3',]
d2 = ['/job:worker/replica:0/task:1/device:GPU:1',
'/job:worker/replica:0/task:1/device:GPU:3',
'/job:worker/replica:0/task:1/device:GPU:0',]
d3 = ['/job:worker/replica:0/task:1/device:GPU:1',
'/job:worker/replica:0/task:1/device:GPU:3',
'/job:worker/replica:0/task:1/device:GPU:2',]
d4 = ['/job:worker/task:0/device:GPU:1',
'/job:worker/task:0/device:GPU:2',
'/job:worker/task:0/device:GPU:3',]
d5 = ['/job:worker/task:0/device:CPU:1',
'/job:worker/task:0/device:CPU:2']
d6 = ['/job:worker/task:0/device:CPU:2',
'/job:worker/task:0/device:CPU:1']
g0 = allreduce.collective_group_key(d0)
g1 = allreduce.collective_group_key(d1)
g2 = allreduce.collective_group_key(d2)
g3 = allreduce.collective_group_key(d3)
g4 = allreduce.collective_group_key(d4)
g5 = allreduce.collective_group_key(d5)
g6 = allreduce.collective_group_key(d6)
self.assertEqual(g0, g1)
self.assertEqual(g0, g2)
self.assertNotEqual(g0, g3)
self.assertEqual(g3, g4)
self.assertEqual(g5, g6)
self.assertNotEqual(g4, g5)
def testExtractRanges(self):
x = []
expected_ranges = []
expected_singles = []
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 3, 4, 6, 7, 8, 9]
expected_ranges = [[3, 4], [6, 9]]
expected_singles = [1]
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 2, 3, 4, 6, 7, 8, 9]
expected_ranges = [[1, 4], [6, 9]]
expected_singles = []
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 3, 4, 6, 7, 9]
expected_ranges = [[3, 4], [6, 7]]
expected_singles = [1, 9]
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 3, 6, 9]
expected_ranges = []
expected_singles = [1, 3, 6, 9]
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
def testPackRange(self):
packing = {}
t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32)
t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32)
gv = [(t0, 'v0'), (t1, 'v1')]
new_t = allreduce.pack_range('0:0', packing, gv, [0, 1])
self.assertEqual(1, new_t.shape.ndims)
self.assertEqual(8, new_t.shape.dims[0])
self.assertEqual(
packing, {
'0:0':
allreduce.GradPackTuple(
indices=range(2),
vars=['v0', 'v1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])])
})
t2 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
gv = [(t0, 'v0'), (t1, 'v1'), (t2, 'v2'), (t3, 'v3')]
packing = {}
new_t = allreduce.pack_range('1:0', packing, gv, [0, 3])
self.assertEqual(1, new_t.shape.ndims)
self.assertEqual(26, new_t.shape.dims[0])
self.assertEqual(
packing, {
'1:0':
allreduce.GradPackTuple(
indices=range(4),
vars=['v0', 'v1', 'v2', 'v3'],
shapes=[
tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3]),
tf.TensorShape([3, 3])
])
})
def testUnpackGradTuple(self):
packing = {
'0:0':
allreduce.GradPackTuple(
indices=range(4),
vars=['v0', 'v1', 'v2', 'v3'],
shapes=[
tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3]),
tf.TensorShape([3, 3])
])
}
tc = tf.constant([0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7, 8,
0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32)
packed_gv = [tc, 'packing_var_placeholder']
gv = allreduce.unpack_grad_tuple(packed_gv, packing['0:0'])
self.assertLen(gv, 4)
self.assertEqual('v0', gv[0][1])
self.assertEqual('v1', gv[1][1])
self.assertEqual('v2', gv[2][1])
self.assertEqual('v3', gv[3][1])
self.assertEqual(1, gv[0][0].shape.ndims)
self.assertEqual(4, gv[0][0].shape.dims[0])
self.assertEqual(1, gv[1][0].shape.ndims)
self.assertEqual(4, gv[1][0].shape.dims[0])
self.assertEqual(2, gv[2][0].shape.ndims)
self.assertEqual(3, gv[2][0].shape.dims[0])
self.assertEqual(3, gv[2][0].shape.dims[1])
def testPackSmallTensors(self):
t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32)
t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32)
t2 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
tower_grads = []
for d in range(0, 3):
gv = [(t0, 'v_%d_0' % d), (t1, 'v_%d_1' %d), (t2, 'v_%d_2' %d),
(t3, 'v_%d_3' % d)]
tower_grads.append(gv)
# 1) Set the size limit so small that nothing gets concatenated.
new_tower_grads, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=12,
max_group=10)
self.assertEqual(tower_grads, new_tower_grads)
self.assertIs(packing, None)
# 2) Set the size limit so only the first two tensors get concatenated
new_tower_grads, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=16, # 16 bytes == 4 elements
max_group=10)
self.assertLen(new_tower_grads, 3)
self.assertLen(tower_grads[0], 4)
first_tower = new_tower_grads[0]
self.assertLen(first_tower, 3)
self.assertEqual(1, first_tower[0][0].shape.ndims)
self.assertEqual(8, first_tower[0][0].shape.dims[0])
self.assertEqual(packing,
{'0:0': allreduce.GradPackTuple(
indices=range(2),
vars=['v_0_0', 'v_0_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'1:0': allreduce.GradPackTuple(
indices=range(2),
vars=['v_1_0', 'v_1_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'2:0': allreduce.GradPackTuple(
indices=range(2),
vars=['v_2_0', 'v_2_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])])})
# 3) Set the size limit so all tensors get concatenated
new_tower_grads, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=256, # bytes = 64 elements
max_group=10)
self.assertLen(new_tower_grads, 3)
self.assertLen(tower_grads[0], 4)
self.assertLen(new_tower_grads[0], 1)
first_tower = new_tower_grads[0]
self.assertEqual(1, first_tower[0][0].shape.ndims)
self.assertEqual(26, first_tower[0][0].shape.dims[0])
self.assertEqual(packing,
{'0:0': allreduce.GradPackTuple(
indices=range(4),
vars=['v_0_0', 'v_0_1', 'v_0_2', 'v_0_3'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])]),
'1:0': allreduce.GradPackTuple(
indices=range(4),
vars=['v_1_0', 'v_1_1', 'v_1_2', 'v_1_3'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])]),
'2:0': allreduce.GradPackTuple(
indices=range(4),
vars=['v_2_0', 'v_2_1', 'v_2_2', 'v_2_3'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])])})
def testUnpackSmallTensors(self):
packing = {'0:0': allreduce.GradPackTuple(indices=range(2),
vars=['v_0_0', 'v_0_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'0:1': allreduce.GradPackTuple(indices=range(3, 5),
vars=['v_0_3', 'v_0_4'],
shapes=[tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])]),
'1:0': allreduce.GradPackTuple(indices=range(2),
vars=['v_1_0', 'v_1_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'1:1': allreduce.GradPackTuple(indices=range(3, 5),
vars=['v_1_3', 'v_1_4'],
shapes=[tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])])}
t0 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7], dtype=tf.float32)
t1 = tf.constant([17, 17], dtype=tf.float32)
t2 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, 8,
0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32)
t3 = tf.constant([0], dtype=tf.float32)
tower_grads = []
for d in range(0, 2):
one_tower = [(t0, 'packing_var_placeholder'),
(t2, 'packing_var_placeholder'),
(t1, 'v_%d_2' % d), (t3, 'v_%d_5' %d)]
tower_grads.append(one_tower)
new_tower_grads = allreduce.unpack_small_tensors(tower_grads, packing)
self.assertLen(new_tower_grads, 2)
for d, tg in enumerate(new_tower_grads):
self.assertLen(tg, 6)
self.assertEqual('v_%d_0' % d, tg[0][1])
self.assertEqual('v_%d_1' % d, tg[1][1])
self.assertEqual('v_%d_2' % d, tg[2][1])
self.assertEqual('v_%d_3' % d, tg[3][1])
self.assertEqual('v_%d_4' % d, tg[4][1])
self.assertEqual('v_%d_5' % d, tg[5][1])
self.assertEqual(1, tg[0][0].shape.ndims)
self.assertEqual(4, tg[0][0].shape.dims[0])
self.assertEqual(1, tg[1][0].shape.ndims)
self.assertEqual(4, tg[1][0].shape.dims[0])
self.assertEqual(1, tg[2][0].shape.ndims)
self.assertEqual(2, tg[2][0].shape.dims[0])
self.assertEqual(2, tg[3][0].shape.ndims)
self.assertEqual(3, tg[3][0].shape.dims[0])
self.assertEqual(3, tg[3][0].shape.dims[1])
self.assertEqual(2, tg[4][0].shape.ndims)
self.assertEqual(3, tg[4][0].shape.dims[0])
self.assertEqual(3, tg[4][0].shape.dims[1])
self.assertEqual(1, tg[5][0].shape.ndims)
self.assertEqual(1, tg[5][0].shape.dims[0])
class DynamicPackingTest(test_util.TensorFlowTestCase):
"""Packing/Unpacking tests that require executing a TensorFlow session."""
def _init_tensors(self, num_towers, tensor_shapes):
"""Construct a collection of tensors across multiple devices."""
num_tensors = len(tensor_shapes)
consts = []
tensors = []
vrbls = []
tower_grads = []
tf.Variable([-1], dtype=tf.int32, name='packing_var_placeholder')
for dev_idx in range(0, num_towers):
devname = '/job:localhost/device:GPU:%d' % dev_idx
consts.append([])
tensors.append([])
vrbls.append([])
with tf.device(devname):
base_value = 0
gv_tuples = []
for t_idx in range(0, num_tensors):
shape = tensor_shapes[t_idx]
num_elts = 0
for d in shape:
num_elts = (num_elts or 1) * d
c = np.fromiter(range(base_value, base_value + num_elts),
dtype=np.float32).reshape(shape)
base_value += num_elts
consts[dev_idx].append(c)
tensors[dev_idx].append(tf.constant(c))
vrbls[dev_idx].append(
tf.Variable(c, name='v_d%d_t%d' % (dev_idx, t_idx)))
gv_tuples.append((tensors[dev_idx][-1], vrbls[dev_idx][-1]))
tower_grads.append(gv_tuples)
return tower_grads, consts, tensors, vrbls
_test_tuple = pycoll.namedtuple('_test_tuple',
'num_devices, in_shapes out_shapes out_i')
def _do_pack_unpack_test(self, tt):
"""Do a single pack-unpack test.
Args:
tt: A _test_tuple defining the parameters of the test to do.
This test executes a graph that performs a pack of tower_grads
followed by an unpack and verifies that the shapes and values
of gradient tensors are unchanged, along with paired variables.
"""
with ops.Graph().as_default():
tower_grads, consts, _, vrbls = self._init_tensors(
tt.num_devices, tt.in_shapes)
packed_tg, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=40, max_group=10)
unpacked_tg = allreduce.unpack_small_tensors(packed_tg, packing)
with self.test_session() as sess:
sess.run(variables.global_variables_initializer())
packed = sess.run(packed_tg)
for d in range(0, tt.num_devices):
for t in range(0, len(tt.out_shapes)):
num_elts = 0
for dim in tt.out_shapes[t]:
num_elts = (num_elts or 1) * dim
self.assertTrue(np.array_equal(
np.array(range(tt.out_i[t], tt.out_i[t] + num_elts),
dtype=np.float32).reshape(tt.out_shapes[t]),
packed[d][t][0]))
unpacked = sess.run(unpacked_tg)
for d in range(0, tt.num_devices):
for t in range(0, len(tt.in_shapes)):
self.assertTrue(np.array_equal(consts[d][t], unpacked[d][t][0]))
self.assertEqual(vrbls[d][t], unpacked_tg[d][t][1])
def testPackUnpack0(self):
self._do_pack_unpack_test(
self._test_tuple(num_devices=3,
in_shapes=[[8], [3, 3], [12], [5, 5, 5]],
out_shapes=[[17], [12], [5, 5, 5]],
out_i=[0, 17, 29]))
def testPackUnpack1(self):
self._do_pack_unpack_test(
self._test_tuple(num_devices=4,
in_shapes=[[5, 5, 5], [2, 3], [5]],
out_shapes=[[11], [5, 5, 5]],
out_i=[125, 0]))
def testPackUnpack2(self):
self._do_pack_unpack_test(
self._test_tuple(num_devices=2,
in_shapes=[[5, 5, 5], [2, 3], [1, 5], [7], [100]],
out_shapes=[[18], [5, 5, 5], [100]],
out_i=[125, 0, 143]))
def _do_all_reduce_pack_test(self, tt):
"""Test that all-reduce results are the same with or without packing."""
with ops.Graph().as_default():
tower_grads, consts, _, _ = self._init_tensors(
tt.num_devices, tt.in_shapes)
dev_prefixes = ['/job:localhost']
num_workers = 1
alg = 'xring'
shards = 1
single_session = True
gpu_indices = range(0, tt.num_devices)
assert len(gpu_indices) == len(tower_grads)
no_pack_all_reduce = allreduce.sum_gradients_all_reduce(
single_session,
dev_prefixes, tower_grads, num_workers, alg, shards,
gpu_indices,
agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
packed_tg, packing = allreduce.pack_small_tensors(tower_grads, 100, 100)
packed_all_reduce = allreduce.sum_gradients_all_reduce(
single_session,
dev_prefixes, packed_tg, num_workers, alg, shards,
gpu_indices,
agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
unpacked_tg = allreduce.unpack_small_tensors(packed_all_reduce, packing)
with self.test_session() as sess:
sess.run(variables.global_variables_initializer())
no_pack_values = sess.run(no_pack_all_reduce)
pack_unpack_values = sess.run(unpacked_tg)
for d in range(1, tt.num_devices):
for t in range(0, len(tt.in_shapes)):
self.assertTrue(np.allclose(no_pack_values[d][t][0],
tt.num_devices * consts[0][t]))
self.assertTrue(np.array_equal(no_pack_values[d][t][0],
pack_unpack_values[d][t][0]))
def testAllReducePacked0(self):
self._do_all_reduce_pack_test(
self._test_tuple(num_devices=3,
in_shapes=[[8], [3, 3], [12], [5, 5, 5]],
out_shapes=[[17], [12], [5, 5, 5]],
out_i=[0, 17, 29]))
def testAllReducePacked1(self):
self._do_all_reduce_pack_test(
self._test_tuple(num_devices=2,
in_shapes=[[8], [3, 3], [12], [5, 5, 5], [3], [4]],
out_shapes=[[17], [7], [12], [5, 5, 5]],
out_i=[0, 17, 29, 154, 157]))
if __name__ == '__main__':
tf.disable_v2_behavior()
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes and functions for doing a single-machine batch all-reduce.
An all-reduce takes the reduction (typically a sum) of a list of tensors,
each on a different device. The result must end up back on each device, which is
where the word "all" comes from. In summary, each device starts with a single
tensor, and ends up with the reduction of all tensors.
A batch all-reduce performs several independent all-reduces. When doing a batch
all-reduce, care is taken to evenly distribute the reduction computations
across devices and inter-device tensor transfers across device links.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# TODO(reedwm): Support distributed all-reduces in this file.
# TODO(reedwm): Merge this code with allreduce.py, which contains some batch
# all-reduce code that this file calls. allreduce.py also supports distributed
# batch-reduce while this file only supports single-machine all-reduce.
import abc
import six
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import data_flow_ops
import allreduce
import constants
def _all_reduce_using_copy(tensors_across_devices, use_mean):
"""Does an all-reduce of a list of tensors by copying to the current device.
The tensors are copied to the current device and then reduced.
Args:
tensors_across_devices: A list of tensors, each on a different device.
use_mean: Whether to take the mean of the tensors instead of a sum.
Returns:
A reduced tensor on the current device.
"""
reduced_tensor = tf.add_n(tensors_across_devices)
if use_mean:
reduced_tensor *= 1 / len(tensors_across_devices)
return reduced_tensor
@six.add_metaclass(abc.ABCMeta)
class BatchAllReduceAlgorithm(object):
"""Represents an algorithm for performing a batch all-reduce operation."""
def batch_all_reduce(self,
all_device_tensors,
num_splits,
compact_tensors,
defer_tensors,
xla_compile=False):
"""Performs a batch all-reduce.
The reduction done is a sum.
`all_device_tensors` is a list of list of tensors that will be batch
all-reduced. All tensors within a single inner list must be on the same
device. The nth element in each list, for any n, will be reduced together.
The return value is in the same form as `all_device_tensors`, except that
each tensor is reduced.
For example, if `all_device_tensors` is:
[[ A, B ], # A and B are on GPU 0
[ C, D ]] # C and D are on GPU 1
Then the return value will be:
[[ A+C, B+D ], # These two tensors are on GPU 0
[ A+C, B+D ]] # These two tensors are on GPU 1
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
is a tensor where `i` is the device index and `j` is the tensor index.
num_splits: If not None, tensors will be concatenated and split into this
many pieces during the all-reduce, then split back into their original
shapes afterwards. Has no impact on correctness and can improve
performance. Requires all tensors to be the same type.
compact_tensors: If True, tensors are cast to fp16 before being all-
reduced. Improves performance, but hurts numerical stability.
defer_tensors: If True, every time the return value
`reduced_all_device_tensors` is evaluated, the result will be the
reduced tensor values of `all_device_tensors` from the previous session
run instead of the current session run, or zero on the first session
run. This can improve performance. When training neural networks,
deferring gradients often does not harm training, so this can be used to
improve performance.
xla_compile: If True, use XLA to compile gradients packing and unpacking
ops.
Returns:
reduced_all_device_tensors: A list in the same form as
`all_device_tensors`, except each tensor has been reduced.
warmup_ops: A list of ops that need to be run once before the all-reduce can
occur.
"""
# Before all-reducing tensors, we do several preprocessing functions that
# can speed up the all-reduce. We undo these functions after all-reducing
# the tensors.
# all_device_packed_tensors is a 2-d list of tensors indexed by
# [device_id][tensor_id], holding packed tensors from all devices involved
# in all-reduce.
all_device_packed_tensors = []
# all_device_warmup_ops is a 2-d list of ops indexed by
# [device_id][tensor_id], holding warmup_ops that need to be run once before
# all-reduce can occur.
all_device_warmup_ops = []
# all_device_put_ops is a 2-d list of ops indexed by
# [device_id][tensor_id], holding put ops for deferred tensors. They will be
# called in each all-reduce step automatically due to control dependency.
all_device_put_ops = []
# packers is a list of _TensorPacker, one for each device involved in
# all-reduce.
packers = [
_TensorPacker(num_splits, compact_tensors) for _ in all_device_tensors
]
for packer, device_tensors in zip(packers, all_device_tensors):
def pack_single_device_tensors(packer=packer,
device_tensors=device_tensors):
"""Pack gradient tensors of a device."""
packed_tensors = packer.maybe_concat_tensors(device_tensors)
packed_tensors = packer.maybe_compact_tensors(packed_tensors)
# When xla_compile=False, defer tensors after concat for better
# performance.
if defer_tensors and not xla_compile:
packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
packed_tensors)
all_device_put_ops.append(put_ops)
all_device_warmup_ops.append(warmup_ops)
packed_tensors = packer.maybe_split_tensors(packed_tensors)
return packed_tensors
with tf.device(device_tensors[0].device):
if xla_compile:
packed_tensors = tf.xla.experimental.compile(
pack_single_device_tensors)
# When xla_compile=True, intermediate tensors in packing process are
# not materialized. Thus, we defer tensors after packing process is
# completed instead of in the middle of it.
if defer_tensors:
packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
packed_tensors)
all_device_put_ops.append(put_ops)
all_device_warmup_ops.append(warmup_ops)
else:
packed_tensors = pack_single_device_tensors()
all_device_packed_tensors.append(packed_tensors)
# Perform all-reduce on packed tensors.
all_device_tensors = self._do_batch_all_reduce(all_device_packed_tensors)
all_device_unpacked_tensors = []
for packer, device_tensors in zip(packers, all_device_tensors):
def unpack_single_device_tensors(packer=packer,
device_tensors=device_tensors):
"""Unpack gradient tensors of a device."""
unpacked_tensors = packer.undo_maybe_split_tensors(device_tensors)
unpacked_tensors = packer.undo_maybe_compact_tensors(unpacked_tensors)
unpacked_tensors = packer.undo_maybe_concat_tensors(unpacked_tensors)
return unpacked_tensors
with tf.device(device_tensors[0].device):
if xla_compile:
unpacked_device_tensor = tf.xla.experimental.compile(
unpack_single_device_tensors)
else:
unpacked_device_tensor = unpack_single_device_tensors()
all_device_unpacked_tensors.append(unpacked_device_tensor)
# Note: There is no undo operation for deferring tensors. But we do need to
# call _add_put_op_control_deps at the end if we deferred the tensors.
if defer_tensors:
all_device_unpacked_tensors = _add_put_op_control_deps(
all_device_unpacked_tensors, num_splits, all_device_put_ops)
return all_device_unpacked_tensors, all_device_warmup_ops
@abc.abstractmethod
def _do_batch_all_reduce(self, all_device_tensors):
"""Performs a batch all-reduce.
Unlike `self.batch_all_reduce`, this does not do any preprocessing of the
tensors.
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
is a tensor where `i` is the device index and `j` is the tensor index.
Returns:
reduced_all_device_tensors: A list in the same form as
`all_device_tensors`, except each tensor has been reduced.
"""
pass
class CopyToDeviceAlgorithm(BatchAllReduceAlgorithm):
"""An algorithm that copies tensors to be reduced to a specific device."""
def __init__(self, devices_to_reduce_on, use_mean=False):
self._devices = devices_to_reduce_on
self._use_mean = use_mean
def _do_batch_all_reduce(self, all_device_tensors):
reduced_tensors = []
for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
with tf.device(self._devices[i % len(self._devices)]):
reduced_tensor = _all_reduce_using_copy(tensors_across_devices,
self._use_mean)
reduced_tensors.append(reduced_tensor)
# The tensors will be brought back to each device once they are used.
return [reduced_tensors] * len(all_device_tensors)
class HierarchicalCopyAlgorithm(BatchAllReduceAlgorithm):
"""An algorithm that uses hierarchical copies. This is only optimized for
eight devices connected in NetworkTopology.DGX1 or NetworkTopology.GCP_V100
topology.
"""
def __init__(self, network_topology):
"""Initializer for HierarchicalCopyAlgorithm.
Args:
network_topology: An instance of Enum class constants.NetworkTopology.
"""
self._network_topology = network_topology
def _do_batch_all_reduce(self, all_device_tensors):
avail_devices = [device_tensors[0].device
for device_tensors in all_device_tensors]
reduced_tensors = []
num_devices = len(avail_devices)
group_size = num_devices // 2
for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
group_0_main_device, group_1_main_device = self.__get_main_devices(
i, num_devices)
if group_0_main_device < group_size:
group_0_begin = 0
group_1_begin = group_size
else:
group_0_begin = group_size
group_1_begin = 0
# Reduce the first group.
group_0_tensors = tensors_across_devices[group_0_begin:
group_0_begin + group_size]
with tf.device(avail_devices[group_0_main_device]):
group_0_reduced_tensor = _all_reduce_using_copy(group_0_tensors, False)
# Reduce the second group.
group_1_tensors = tensors_across_devices[group_1_begin:
group_1_begin + group_size]
with tf.device(avail_devices[group_1_main_device]):
group_1_reduced_tensor = _all_reduce_using_copy(group_1_tensors, False)
# Reduce between the groups.
with tf.device(avail_devices[group_0_main_device]):
total_reduced_tensor = _all_reduce_using_copy(
[group_0_reduced_tensor, group_1_reduced_tensor], False)
# Broadcast the result back into the root of each group.
with tf.device(avail_devices[group_0_main_device]):
group_0_reduced_tensor_bcast = tf.identity(total_reduced_tensor)
with tf.device(avail_devices[group_1_main_device]):
group_1_reduced_tensor_bcast = tf.identity(total_reduced_tensor)
reduced_tensors_bcast = []
for j in range(len(tensors_across_devices)):
with tf.device(avail_devices[j]):
# Broadcast the result back to each member in the group from the root.
if (group_0_main_device < group_size) == (j < group_size):
src_device_tensor = group_0_reduced_tensor_bcast
else:
src_device_tensor = group_1_reduced_tensor_bcast
reduced_tensors_bcast.append(tf.identity(src_device_tensor))
reduced_tensors.append(reduced_tensors_bcast)
reduced_tensors = list(zip(*reduced_tensors))
return reduced_tensors
def __get_main_devices(self, tensor_index, num_devices):
"""Returns the pair of main devices to use for initial reduction.
Args:
tensor_index: Index of the current tensor in the list of tensors to copy.
num_devices: Total number of devices.
Returns:
A tuple containing the pair of main device indices for the initial
reduction. The first element of the tuple should also be used for the
final reduction.
Raises:
ValueError: Invalid input arguments.
"""
if self._network_topology == constants.NetworkTopology.DGX1:
return tensor_index % num_devices, (tensor_index +
(num_devices // 2)) % num_devices
elif self._network_topology == constants.NetworkTopology.GCP_V100:
if num_devices != 8:
raise ValueError('HierarchicalCopy only supports eight devices in %s.' %
self._network_topology)
# TODO(hinsu): Generalize main device indices to handle any other
# isomorphic connection graph that connects two cliques using connections
# other than 0-5 and 2-7.
main_device_pairs = [(0, 5), (2, 7), (5, 0), (7, 2)]
return main_device_pairs[tensor_index % len(main_device_pairs)]
else:
# TODO(reedwm): make this logic more general for arbitrary topology.
raise ValueError(
'HierarchicalCopy is not supported for %s network topology.' %
self._network_topology)
class AllReduceSpecAlgorithm(BatchAllReduceAlgorithm):
"""An algorithm that uses an all reduce spec."""
def __init__(self, all_reduce_spec, gpu_indices, agg_small_grads_max_bytes,
agg_small_grads_max_group):
spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
if len(spec) != 1:
raise ValueError(
'Replicated mode does not support hybrid all-reduce strategies')
self._all_reduce_spec = spec[0]
self._gpu_indices = gpu_indices
self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
self._agg_small_grads_max_group = agg_small_grads_max_group
def _do_batch_all_reduce(self, all_device_tensors):
# TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
# gradient aggregation code, since gradient aggregation is doing an all
# reduce. Currently, we do gradient repacking in two different places.
# TODO(reedwm): Change the allreduce code to reduce tensors instead of
# tower_grads.
tower_grads = [[(t, None) for t in device_tensors]
for device_tensors in all_device_tensors]
aggregated_device_grads = allreduce.sum_gradients_all_reduce(
False, # single_session
['/job:localhost'],
tower_grads,
1,
self._all_reduce_spec.alg,
self._all_reduce_spec.shards,
self._gpu_indices,
agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
agg_small_grads_max_group=self._agg_small_grads_max_group)
return [[t for t, _ in grad_vars] for grad_vars in aggregated_device_grads]
def algorithm_from_params(params):
"""Returns a BatchAllReduceAlgorithm from a Params tuple."""
if params.all_reduce_spec:
if params.gpu_indices:
gpu_indices = [int(x) for x in params.gpu_indices.split(',')]
else:
gpu_indices = [x for x in range(params.num_gpus)]
return AllReduceSpecAlgorithm(params.all_reduce_spec, gpu_indices,
params.agg_small_grads_max_bytes,
params.agg_small_grads_max_group)
elif params.hierarchical_copy:
return HierarchicalCopyAlgorithm(params.network_topology)
else:
if params.local_parameter_device == 'gpu':
devices_to_reduce_on = ['/gpu:%d' % i for i in range(params.num_gpus)]
else:
devices_to_reduce_on = ['/cpu:0']
return CopyToDeviceAlgorithm(devices_to_reduce_on)
def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True):
"""Applies a function to each tensor in `all_device_tensors`.
A new list of lists of tensors is returned, where every tensor in
`all_device_tensors` has had `apply_func` called on it. `all_device_tensors`
is not modified.
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is
a tensor where `i` is the device index and `j` is the tensor index.
apply_func: A function taking in three arguments: tensor, device_index,
tensor_index, and returning a modified tensor.
`tensor` is `all_device_tensors[device_index][tensor_index]`.
colocate: If True, apply_func will be run under context manager colocated
with its input tensor.
Returns:
A list in the same form as `all_device_tensors`, except each tensor has had
`apply_func` called on it.
"""
new_all_device_tensors = []
for device_index, device_tensors in enumerate(all_device_tensors):
new_device_tensors = []
for tensor_index, t in enumerate(device_tensors):
if colocate:
with tf.colocate_with(t):
new_t = apply_func(t, device_index, tensor_index)
else:
new_t = apply_func(t, device_index, tensor_index)
new_device_tensors.append(new_t)
new_all_device_tensors.append(new_device_tensors)
return new_all_device_tensors
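# A minimal sketch of an `apply_func` argument (the helper name below is
# hypothetical and only for illustration): cast every tensor to fp16,
# ignoring its device and tensor indices.
#
#   def _cast_to_fp16(tensor, device_index, tensor_index):
#     del device_index, tensor_index  # unused
#     return tf.cast(tensor, tf.float16)
#
#   halved = _apply_to_all_device_tensors(all_device_tensors, _cast_to_fp16)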
def _defer_tensor(tensor):
"""Defers the retrieval of a tensor.
The tensor is put into a StagingArea, and the return value is the
retrieval of the tensor from the StagingArea. The effect is that the
tensor returned from this function is the tensor that was put in the
StagingArea for the previous Session.run() call.
Args:
tensor: The tensor to defer for one step.
Returns:
deferred_tensor: The tensor deferred for one step.
put_op: An op to put `tensor` in the StagingArea. Must be run every step
that `deferred_tensor` is run.
warmup_op: A warmup op that should be called before the first step. Puts
a zero tensor into the StagingArea.
"""
tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape])
put_op = tensor_stage.put([tensor])
warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)])
# Fetch the next tensor to use.
(tensor,) = tensor_stage.get()
return tensor, put_op, warmup_op
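# A minimal usage sketch (variable names are hypothetical): the warmup op is
# run once so the first get() has something to retrieve, and the put op must
# be run on every step that fetches the deferred tensor.
#
#   deferred, put_op, warmup_op = _defer_tensor(some_tensor)
#   sess.run(warmup_op)
#   for _ in range(num_steps):
#     value, _ = sess.run([deferred, put_op])  # lags one step behind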
def defer_single_device_tensors(device_tensors):
"""Defer tensors (gradients in this case) from a single device.
Args:
    device_tensors: A list of gradient tensors from a single device to defer.
Returns:
deferred_tensors: A list of tensors deferred for one step.
put_ops: A list of ops that put `tensors` in the StagingAreas. Must be run
every step that `deferred_tensors` is run.
warmup_ops: Warmup ops that should be called before the first step. Puts
zero tensors into the StagingArea.
"""
put_ops = []
warmup_ops = []
deferred_tensors = []
for tensor in device_tensors:
deferred_tensor, put_op, warmup_op = _defer_tensor(tensor)
deferred_tensors.append(deferred_tensor)
put_ops.append(put_op)
warmup_ops.append(warmup_op)
return deferred_tensors, put_ops, warmup_ops
def _add_put_op_control_deps(all_device_tensors, num_splits, put_ops):
"""Add control dependencies from `put_ops` to `all_device_tensors`.
This should only be called when deferred tensors are being used.
The control dependencies are added so that the put ops are run whenever
`all_device_tensors` is run. That way, the caller does not have to explicitly
run the put ops.
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is
a tensor where `i` is the device index and `j` is the tensor index.
num_splits: The number of splits that were used for the all-reduce.
put_ops: A list of put ops from deferring the tensors.
Returns:
A list in the same form as `all_device_tensors`, except each tensor has a
control dependency on an op in `put_ops`.
"""
def apply_func(tensor, device_index, tensor_index):
if num_splits == 0:
deps = [put_ops[device_index][tensor_index]]
else:
deps = put_ops[device_index]
assert len(deps) == 1
with tf.control_dependencies(deps):
return tf.identity(tensor, name='control_dependency')
return _apply_to_all_device_tensors(all_device_tensors, apply_func)
class _TensorPacker(object):
"""Packs and unpacks tensors into groups.
  This class first concatenates a set of tensors, then splits the concatenated
  tensor into a small number of chunks. This is useful for all-reducing
  tensors, as doing a small number of all-reduces on large tensors can be
  faster than doing a large number of all-reduces on small tensors.
  It also provides an option to compact tensors by casting them to fp16, for
  better all-reduce performance.
  This class maintains state about the processed tensors, such as their shapes
  and dtypes, so each packer can only be used to pack and unpack one list of
  tensors. If you need to pack multiple lists of tensors (say, from multiple
  devices), then you need multiple _TensorPacker objects, one per device.
"""
def __init__(self, num_splits, compact):
"""Initializes the _TensorPacker.
Args:
num_splits: The number of tensors to split the concatenated tensor into.
        The batch all-reduce will consist of `num_splits` all-reduces. If None
        or zero, tensors are not split or concatenated.
      compact: If True, tensors are cast to fp16 during packing and cast back
        to their original dtypes during unpacking.
"""
self._num_splits = num_splits
self._compact = compact
self._before_compact_dtypes = []
def maybe_concat_tensors(self, device_tensors):
"""Concatenate tensors into a single tensor."""
if not self._num_splits:
return device_tensors
flat_tensors = [tf.reshape(t, [-1]) for t in device_tensors]
self._orig_shapes = [t.shape for t in device_tensors]
self._orig_sizes = [s.num_elements() for s in self._orig_shapes]
# All shapes must be fully defined.
assert None not in self._orig_sizes
concatenated_grad = tf.concat(flat_tensors, 0)
return [concatenated_grad]
def maybe_split_tensors(self, concatenated_tensor):
"""Split concatenated tensor into `num_splits` pieces."""
if not self._num_splits:
return concatenated_tensor
if len(concatenated_tensor) != 1:
raise RuntimeError('tensors must be concatenated via '
'maybe_concat_tensors() before splitting')
concatenated_tensor = concatenated_tensor[0]
total_tensor_size = concatenated_tensor.shape.num_elements()
split_size = total_tensor_size // self._num_splits
split_size_last = total_tensor_size - split_size * (self._num_splits - 1)
split_sizes = [split_size] * (self._num_splits - 1) + [split_size_last]
tensor_packs = tf.split(concatenated_tensor, split_sizes)
return tensor_packs
def undo_maybe_split_tensors(self, tensor_packs):
"""Undo maybe_split_tensors()."""
if not self._num_splits:
return tensor_packs
return [tf.concat(tensor_packs, 0)]
def undo_maybe_concat_tensors(self, concatenated_tensor):
"""Undo maybe_concat_tensors()."""
if not self._num_splits:
return concatenated_tensor
if len(concatenated_tensor) != 1:
raise RuntimeError(
'undo_maybe_split_tensors() must be called before '
'undo_maybe_concat_tensors when num_splits is greater than 1')
concatenated_tensor = concatenated_tensor[0]
tensors_with_sizes = tf.split(concatenated_tensor,
self._orig_sizes)
tensors_with_shapes = [
tf.reshape(grad, shape) for grad, shape in zip(
tensors_with_sizes, self._orig_shapes)
]
return tensors_with_shapes
def maybe_compact_tensors(self, device_tensors):
"""Cast tensors to fp16 and store their original types."""
if not self._compact:
return device_tensors
if self._before_compact_dtypes:
raise RuntimeError('maybe_compact_tensors can only be called once.')
self._before_compact_dtypes = [t.dtype for t in device_tensors]
compact_tensors = [tf.cast(t, tf.float16) for t in device_tensors]
return compact_tensors
def undo_maybe_compact_tensors(self, compact_tensors):
"""Undo maybe_compact_tensors()."""
if not self._compact:
return compact_tensors
if not self._before_compact_dtypes:
raise RuntimeError('maybe_compact_tensors() must be called before '
'undo_maybe_compact_tensors()')
device_tensors = [
tf.cast(t, dtype)
for t, dtype in zip(compact_tensors, self._before_compact_dtypes)
]
return device_tensors
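# A minimal round-trip sketch for _TensorPacker (the tensors and the exact
# call ordering are illustrative; the calling code may order packing
# differently):
#
#   packer = _TensorPacker(num_splits=2, compact=True)
#   tensors = [tf.ones([3, 3]), tf.ones([5])]        # 9 + 5 = 14 elements
#   packed = packer.maybe_compact_tensors(tensors)   # cast to fp16
#   packed = packer.maybe_concat_tensors(packed)     # one 14-element tensor
#   packed = packer.maybe_split_tensors(packed)      # two chunks of 7
#   # ... all-reduce each chunk here ...
#   out = packer.undo_maybe_split_tensors(packed)
#   out = packer.undo_maybe_concat_tensors(out)
#   out = packer.undo_maybe_compact_tensors(out)     # fp32, shapes [3,3], [5]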
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow benchmark library.
See the README for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
from collections import namedtuple
import contextlib
import math
import multiprocessing
import os
import re
import threading
import time
import traceback
from absl import flags as absl_flags
import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import cnn_util
import constants
import datasets
import flags
import mlperf
import variable_mgr
import variable_mgr_util
from cnn_util import log_fn
from models import model_config
from platforms import util as platforms_util
from google.protobuf import text_format
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python import debug as tf_debug
from tensorflow.python.client import timeline
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_util_impl
from tensorflow.python.framework import importer
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import nest
_DEFAULT_NUM_BATCHES = 100
# GraphInfo encapsulates the tensors/ops that we care about after building a
# graph. We use them to benchmark the graph.
GraphInfo = namedtuple( # pylint: disable=invalid-name
'GraphInfo',
[
# Ops that produce the input batches (before preprocessing).
'input_producer_op',
        # Ops that add the preprocessed images to the staging areas
'enqueue_ops',
# Fetches of sess.run()
'fetches',
# Op that performs synchronization in distributed mode
'execution_barrier',
# The global step variable
'global_step',
# Group of ops that perform per-device initialization work
'local_var_init_op_group',
# Op to produce summaries
'summary_op'
])
# InputProcessingInfo contains various sources of inputs which will later be fed
# into the model. If synthetic data is used, all three fields are None.
InputProcessingInfo = namedtuple(
'InputProcessingInfo',
[
# The first two fields are non-None iff datasets prefetching is not
# used.
# Ops that produce the input batches.
'input_producer_op',
# A list of StagingArea for each device.
'input_producer_stages',
# Input produced using multi device iterator. Non-None iff datasets
# prefetching is used
'multi_device_iterator_input'
])
# TODO(reedwm): add upper_bound and lower_bound to appropriate integer and
# float flags, and change certain string flags to enum flags.
flags.DEFINE_string('model', 'trivial',
                    'Name of the model to run. The list of supported models '
                    'is defined in models/model.py.')
# The code will first check if it's running in benchmarking mode
# or evaluation mode, depending on 'eval':
# Under evaluation mode, this script will read a saved model,
# and compute the accuracy of the model against a validation dataset.
# Additional ops for accuracy and top_k predictors are only used under
# this mode.
# Under benchmarking mode, the user can specify whether or not to use
# the forward-only option, which will only compute the loss function.
# forward-only cannot be enabled with eval at the same time.
flags.DEFINE_boolean('eval', False, 'whether to run eval or benchmarking')
flags.DEFINE_integer('eval_interval_secs', 0,
'How often to run eval on saved checkpoints. Usually the '
'same as save_model_secs from the corresponding training '
'run. Pass 0 to eval only once.')
flags.DEFINE_integer('eval_during_training_every_n_steps', None,
'Every n steps during training, pause training, run '
'evaluation, then resume training. Must not be used with '
'--eval, as unlike --eval, this option causes both '
'training and eval to be done. This may take slightly '
'more GPU memory than running just training or evaluation '
'alone. It also may slightly slow down training, even '
'when not taking into account the additional time to '
'evaluate.', lower_bound=1)
flags.DEFINE_float('eval_during_training_every_n_epochs', None,
'After every n training epochs, pause training, run '
'evaluation, then resume training. See '
'--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_steps', [],
'Specify a list of training steps, pause training at each of '
'these steps, run evaluation, then resume training. See '
'--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_epochs', [],
'Specify a list of training epochs, pause training after '
'each of these epochs, run evaluation, then resume training. '
'See --eval_during_training_every_n_steps for more '
'information.')
flags.DEFINE_boolean('forward_only', False,
                     'whether to use forward-only or training for benchmarking')
flags.DEFINE_boolean('freeze_when_forward_only', False,
'whether to freeze the graph when in forward-only mode.')
flags.DEFINE_boolean('print_training_accuracy', False,
'whether to calculate and print training accuracy during '
'training')
flags.DEFINE_integer('batch_size', 0, 'batch size per compute device')
flags.DEFINE_integer('eval_batch_size', 0, 'eval batch size per compute device')
flags.DEFINE_integer('batch_group_size', 1,
'number of groups of batches processed in the image '
'producer.')
flags.DEFINE_integer('num_batches', None, 'number of batches to run, excluding '
'warmup. Defaults to %d' % _DEFAULT_NUM_BATCHES)
flags.DEFINE_integer('num_eval_batches', None,
'number of eval batches to run, excluding warmup. '
'Defaults to --num_batches')
flags.DEFINE_float('num_epochs', None,
'number of epochs to run, excluding warmup. '
'This and --num_batches cannot both be specified.')
flags.DEFINE_float('num_eval_epochs', None,
'number of eval epochs to run, excluding warmup. '
'Defaults to --num_epochs')
flags.DEFINE_float('stop_at_top_1_accuracy', None,
'If set, stops training after the evaluation accuracy hits '
'this number. Can only be used with one of the '
'--eval_during_training_* flags.')
flags.DEFINE_boolean('collect_eval_results_async', False,
'If True, start a separate process to postprocess eval '
'results asynchronously. This currently only works with '
'the SSD model.')
flags.DEFINE_integer('num_warmup_batches', None,
'number of batches to run before timing')
flags.DEFINE_integer('autotune_threshold', None,
'The autotune threshold for the models')
# TODO(tucker): change num_gpus to num_devices
flags.DEFINE_integer('num_gpus', 1, 'the number of GPUs to run on')
flags.DEFINE_string('gpu_indices', '', 'indices of worker GPUs in ring order')
flags.DEFINE_integer('display_every', 10,
'Number of local steps after which progress is printed '
'out')
flags.DEFINE_float('display_perf_ewma', None,
'If set, display numbers of images/sec using exponentially '
                   'weighted moving average with the specified weight, which '
'defines how much current value contributes to the reported '
'average. Increasing weight makes the reported performance '
'number reflect more about the real-time speed instead of '
'the entire history', lower_bound=0, upper_bound=1)
flags.DEFINE_string('data_dir', None,
'Path to dataset in TFRecord format (aka Example '
'protobufs). If not specified, synthetic data will be '
'used.')
flags.DEFINE_string('data_name', None,
'Name of dataset: imagenet or cifar10. If not specified, '
'it is automatically guessed based on data_dir.')
flags.DEFINE_string('resize_method', 'bilinear',
'Method for resizing input images: crop, nearest, '
'bilinear, bicubic, area, or round_robin. The `crop` mode '
'requires source images to be at least as large as the '
'network input size. The `round_robin` mode applies '
'different resize methods based on position in a batch in '
'a round-robin fashion. Other modes support any sizes and '
'apply random bbox distortions before resizing (even with '
'distortions=False).')
flags.DEFINE_boolean('distortions', False,
'Enable/disable distortions during image preprocessing. '
'These include bbox and color distortions.')
flags.DEFINE_boolean('use_datasets', True,
'Enable use of datasets for input pipeline')
flags.DEFINE_string('input_preprocessor', 'default',
'Name of input preprocessor. The list of supported input '
'preprocessors are defined in preprocessing.py.')
flags.DEFINE_string('gpu_thread_mode', 'gpu_private',
'Methods to assign GPU host work to threads. '
'global: all GPUs and CPUs share the same global threads; '
'gpu_private: a private threadpool for each GPU; '
'gpu_shared: all GPUs share the same threadpool.')
flags.DEFINE_integer('per_gpu_thread_count', 0,
'The number of threads to use for GPU. Only valid when '
'gpu_thread_mode is not global.')
flags.DEFINE_boolean('hierarchical_copy', False,
'Use hierarchical copies. Currently only optimized for '
'use on a DGX-1 with 8 GPUs and may perform poorly on '
'other hardware. Requires --num_gpus > 1, and only '
'recommended when --num_gpus=8')
# TODO(hinsu): Support auto-detection of the network topology while still
# retaining the ability to specify a particular topology for debugging.
flags.DEFINE_enum(
'network_topology', constants.NetworkTopology.DGX1,
(constants.NetworkTopology.DGX1, constants.NetworkTopology.GCP_V100),
'Network topology specifies the topology used to connect multiple devices. '
'Network topology is used to decide the hierarchy to use for the '
'hierarchical_copy.')
flags.DEFINE_integer('gradient_repacking', 0, 'Use gradient repacking. It '
'currently only works with replicated mode. At the end of '
'each step, it repacks the gradients for more efficient '
'cross-device transportation. A non-zero value specifies '
'the number of split packs that will be formed.',
lower_bound=0)
flags.DEFINE_boolean('compact_gradient_transfer', True, 'Compact gradients '
'as much as possible for cross-device transfer and '
'aggregation.')
flags.DEFINE_enum('variable_consistency', 'strong', ('strong', 'relaxed'),
'The data consistency for trainable variables. With strong '
                  'consistency, the variables always have the updates from '
                  'the previous step. With relaxed consistency, all the '
                  'updates will eventually show up in the variables, '
                  'likely one step behind.')
flags.DEFINE_boolean('datasets_repeat_cached_sample', False,
'Enable use of a special datasets pipeline that reads a '
'single TFRecord into memory and repeats it infinitely '
'many times. The purpose of this flag is to make it '
'possible to write regression tests that are not '
'bottlenecked by CNS throughput. '
'Use datasets_use_caching to cache input data.')
flags.DEFINE_enum('local_parameter_device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
'Device to use as parameter server: cpu or gpu. For '
'distributed training, it can affect where caching of '
'variables happens.')
flags.DEFINE_enum('device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
'Device to use for computation: cpu or gpu')
flags.DEFINE_enum('data_format', 'NCHW', ('NHWC', 'NCHW'),
'Data layout to use: NHWC (TF native) or NCHW (cuDNN '
'native, requires GPU).')
flags.DEFINE_integer('num_intra_threads', None,
'Number of threads to use for intra-op parallelism. If '
'set to 0, the system will pick an appropriate number. '
'None is the same as 0 except that it disables intra-op '
'parallelism on a GPU.')
flags.DEFINE_integer('num_inter_threads', 0,
'Number of threads to use for inter-op parallelism. If '
'set to 0, the system will pick an appropriate number.')
flags.DEFINE_boolean('use_numa_affinity', False,
'Whether to turn on NUMA affinity for CPU devices. '
'This is probably only useful when --device=cpu.')
flags.DEFINE_string('trace_file', '',
'Enable TensorFlow tracing and write trace to this file.')
flags.DEFINE_boolean('use_chrome_trace_format', True,
'If True, the trace_file, if specified, will be in a '
'Chrome trace format. If False, then it will be a '
'StepStats raw proto.')
_NUM_STEPS_TO_PROFILE = 10
_NUM_OPS_TO_PRINT = 20
flags.DEFINE_string('tfprof_file', None,
'If specified, write a tfprof ProfileProto to this file. '
'The performance and other aspects of the model can then '
'be analyzed with tfprof. See '
'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/g3doc/command_line.md ' # pylint: disable=line-too-long
'for more info on how to do this. The first %d steps '
'are profiled. Additionally, the top %d most time '
'consuming ops will be printed.\n'
'Note: profiling with tfprof is very slow, but most of the '
'overhead is spent between steps. So, profiling results '
'are more accurate than the slowdown would suggest.' %
(_NUM_STEPS_TO_PROFILE, _NUM_OPS_TO_PRINT))
flags.DEFINE_string('graph_file', None,
'Write the model\'s graph definition to this file. '
'Defaults to binary format unless filename ends in "txt".')
flags.DEFINE_string('partitioned_graph_file_prefix', None,
'If specified, after the graph has been partitioned and '
'optimized, write out each partitioned graph to a file '
'with the given prefix.')
flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'),
'Optimizer to use')
flags.DEFINE_float('init_learning_rate', None,
'Initial learning rate for training.')
flags.DEFINE_string('piecewise_learning_rate_schedule', None,
'Specifies a piecewise learning rate schedule based on the '
'number of epochs. This is the form LR0;E1;LR1;...;En;LRn, '
'where each LRi is a learning rate and each Ei is an epoch '
                    'indexed from 0. The learning rate is LRi if '
                    'E(i-1) <= current_epoch < Ei. For example, if this '
                    'parameter is 0.3;10;0.2;25;0.1, the learning rate is 0.3 '
'for the first 10 epochs, then is 0.2 for the next 15 '
'epochs, then is 0.1 until training ends.')
flags.DEFINE_float('num_epochs_per_decay', 0,
                   'Epochs after which the learning rate decays. If 0, the '
                   'learning rate does not decay.')
flags.DEFINE_float('learning_rate_decay_factor', 0,
'Learning rate decay factor. Decay by this factor every '
'`num_epochs_per_decay` epochs. If 0, learning rate does '
'not decay.')
flags.DEFINE_float('num_learning_rate_warmup_epochs', 0,
                   'Linearly ramp up to the initial learning rate over the '
                   'first num_learning_rate_warmup_epochs epochs.')
flags.DEFINE_float('minimum_learning_rate', 0,
'The minimum learning rate. The learning rate will '
                   'never decay past this value. Requires '
                   '`init_learning_rate`, `num_epochs_per_decay` and '
                   '`learning_rate_decay_factor` to be set.')
flags.DEFINE_float('resnet_base_lr', None, "Base learning rate at bs=256. Only "
"relevant when training ResNet and utilizing the model's "
"learning rate heuristic (get_learning_rate).")
flags.DEFINE_float('momentum', 0.9, 'Momentum for training.')
flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')
flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum in RMSProp.')
flags.DEFINE_float('rmsprop_epsilon', 1.0, 'Epsilon term for RMSProp.')
flags.DEFINE_float('adam_beta1', 0.9, 'Beta1 term for the Adam optimizer')
flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer')
flags.DEFINE_float('adam_epsilon', 1e-8, 'Epsilon term for the Adam optimizer')
flags.DEFINE_float('gradient_clip', None,
'Gradient clipping magnitude. Disabled by default.')
flags.DEFINE_float('weight_decay', 0.00004,
'Weight decay factor for training.')
flags.DEFINE_float('gpu_memory_frac_for_testing', 0,
'If non-zero, the fraction of GPU memory that will be used. '
'Useful for testing the benchmark script, as this allows '
'distributed mode to be run on a single machine. For '
'example, if there are two tasks, each can be allocated '
'~40 percent of the memory on a single machine. This is '
'also useful for using unified memory, as this can be set '
'above 1 to oversubscribe the GPU using unified memory.',
lower_bound=0.)
flags.DEFINE_boolean('use_unified_memory', None,
'If True, allocate unified memory enabling larger models '
'to fit in available device RAM.')
flags.DEFINE_boolean('timestamped_allocator', False,
'If True marks free BFCAllocator::Chunks with time '
'at which they are freed which can allow more efficient '
'memory allocation in cases like RDMA networking.')
flags.DEFINE_integer('gpu_kt_max_interval', 0,
'If > 0, the maximum number of GPU Ops that may be queued '
'in a row without also queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_bytes', 0,
'If > 0, the maximum number of bytes '
'of GPU memory that may be allocated by sequential '
'GPU Ops without queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_pending', 0,
                     'If > 0, no more than this many GPU tracking events may '
                     'be outstanding at any time. When this limit is reached, '
'launch of additional kernels will stall until an '
'outstanding event completes.')
flags.DEFINE_boolean('use_tf_layers', True,
'If True, use tf.layers for neural network layers. This '
'should not affect performance or accuracy in any way.')
flags.DEFINE_integer('tf_random_seed', 1234,
'The TensorFlow random seed. Useful for debugging NaNs, '
'as this can be set to various values to see if the NaNs '
'depend on the seed.')
flags.DEFINE_string('debugger', None,
'If set, use the TensorFlow debugger. If set to "cli", use '
'the local CLI debugger. Otherwise, this must be in the '
'form hostname:port (e.g., localhost:7007) in which case '
'the experimental TensorBoard debugger will be used')
flags.DEFINE_boolean('use_python32_barrier', False,
                     'When on, use threading.Barrier on Python 3.2 and above.')
flags.DEFINE_boolean('ml_perf', False,
'When True, change how the Imagenet input pipeline works '
'slightly to meet the MLPerf compliance rules. This slows '
'down the input pipeline. Without this option, at the end '
'of the input pipeline, the image is divided by 127.5, '
'then 1.0 is subtracted from it, bringing the image '
'values from [0, 255] to [-1.0, 1.0]. With this option, '
                     'each of the three channels (red, green, blue) has the '
                     'average channel value over all images subtracted from '
'it, and no division is done.')
flags.DEFINE_boolean('datasets_use_prefetch', True,
'Enable use of prefetched datasets for input pipeline. '
'This option is meaningless if use_datasets=False.')
flags.DEFINE_integer('datasets_prefetch_buffer_size', 1,
'Prefetching op buffer size per compute device.')
flags.DEFINE_integer('datasets_num_private_threads', None,
'Number of threads for a private threadpool created for '
'all datasets computation. By default, we pick an '
'appropriate number. If set to 0, we use the default '
'tf-Compute threads for dataset operations.')
flags.DEFINE_boolean('datasets_use_caching', False,
'Cache the compressed input data in memory. This improves '
'the data input performance, at the cost of additional '
'memory.')
flags.DEFINE_integer('datasets_parallel_interleave_cycle_length', None,
'Number of parallel file readers interleaving input data.')
flags.DEFINE_boolean('datasets_sloppy_parallel_interleave', False,
'Allow parallel interleave to depart from deterministic '
'ordering, by temporarily skipping over files whose '
'elements are not readily available. This can increase '
                     'throughput, in particular in the presence of stragglers.')
flags.DEFINE_integer('datasets_parallel_interleave_prefetch', None,
'The number of input elements to fetch before they are '
'needed for interleaving.')
flags.DEFINE_integer(
'multi_device_iterator_max_buffer_size', 1,
'Configuration parameter for the MultiDeviceIterator that '
    'specifies the host-side buffer size for each device.')
# Performance tuning parameters.
flags.DEFINE_boolean('winograd_nonfused', True,
'Enable/disable using the Winograd non-fused algorithms.')
flags.DEFINE_boolean(
'batchnorm_persistent', True,
'Enable/disable using the CUDNN_BATCHNORM_SPATIAL_PERSISTENT '
'mode for batchnorm.')
flags.DEFINE_boolean('sync_on_finish', False,
'Enable/disable whether the devices are synced after each '
'step.')
flags.DEFINE_boolean('staged_vars', False,
'whether the variables are staged from the main '
'computation')
flags.DEFINE_boolean('force_gpu_compatible', False,
'whether to enable force_gpu_compatible in GPU_Options')
flags.DEFINE_boolean('allow_growth', None,
'whether to enable allow_growth in GPU_Options')
flags.DEFINE_boolean('xla', False, 'whether to enable XLA auto-jit compilation')
flags.DEFINE_boolean('xla_compile', False,
'Enable xla to compile the graph. Uncompilable ops will '
'result in fatal errors.')
flags.DEFINE_boolean('fuse_decode_and_crop', True,
'Fuse decode_and_crop for image preprocessing.')
flags.DEFINE_boolean('distort_color_in_yiq', True,
'Distort color of input images in YIQ space.')
flags.DEFINE_boolean('enable_optimizations', True,
'Whether to enable grappler and other optimizations.')
flags.DEFINE_string('rewriter_config', None,
'Config for graph optimizers, described as a '
'RewriterConfig proto buffer.')
flags.DEFINE_enum('loss_type_to_report', 'total_loss',
('base_loss', 'total_loss'),
'Which type of loss to output and to write summaries for. '
'The total loss includes L2 loss while the base loss does '
'not. Note that the total loss is always used while '
'computing gradients during training if weight_decay > 0, '
'but explicitly computing the total loss, instead of just '
'computing its gradients, can have a performance impact.')
flags.DEFINE_boolean('single_l2_loss_op', False,
'If True, instead of using an L2 loss op per variable, '
'concatenate the variables into a single tensor and do a '
'single L2 loss on the concatenated tensor.')
flags.DEFINE_boolean('use_resource_vars', False,
'Use resource variables instead of normal variables. '
'Resource variables are slower, but this option is useful '
'for debugging their performance.')
flags.DEFINE_boolean('compute_lr_on_cpu', False,
'If True, do computations related to learning rate on the '
'CPU instead of the GPU. This will significantly improve '
'XLA performance in some cases.')
flags.DEFINE_boolean('sparse_to_dense_grads', False,
'If True, convert all sparse gradients to dense gradients '
'before passing them to the optimizer to update '
'variables. Only affects models with sparse gradients, '
'which currently is only the NCF model.')
# Performance tuning specific to MKL.
flags.DEFINE_boolean('mkl', False, 'If true, set MKL environment variables.')
flags.DEFINE_integer('kmp_blocktime', 0,
'The time, in milliseconds, that a thread should wait, '
'after completing the execution of a parallel region, '
'before sleeping')
flags.DEFINE_string('kmp_affinity', 'granularity=fine,verbose,compact,1,0',
'Restricts execution of certain threads (virtual execution '
'units) to a subset of the physical processing units in a '
'multiprocessor computer.')
flags.DEFINE_integer('kmp_settings', 1,
'If set to 1, MKL settings will be printed.')
# fp16 parameters. If use_fp16=False, no other fp16 parameters apply.
flags.DEFINE_boolean('use_fp16', False,
'Use 16-bit floats for certain tensors instead of 32-bit '
'floats. This is currently experimental.')
# TODO(reedwm): The default loss scale of 128 causes most models to diverge
# on the second step with synthetic data. Changing the tf.set_random_seed
# call to tf.set_random_seed(1235) or most other seed values causes the
# issue not to occur.
flags.DEFINE_float('fp16_loss_scale', None,
'If fp16 is enabled, the loss is multiplied by this amount '
'right before gradients are computed, then each gradient '
'is divided by this amount. Mathematically, this has no '
'effect, but it helps avoid fp16 underflow. Set to 1 to '
'effectively disable. Ignored during eval.')
flags.DEFINE_boolean('fp16_vars', False,
'If fp16 is enabled, also use fp16 for variables. If '
                     'False, the variables are stored in fp32 and cast to '
'fp16 when retrieved. Recommended to leave as False.')
flags.DEFINE_boolean('fp16_enable_auto_loss_scale', False,
'If True and use_fp16 is True, automatically adjust the '
'loss scale during training.')
flags.DEFINE_integer('fp16_inc_loss_scale_every_n', 1000,
'If fp16 is enabled and fp16_enable_auto_loss_scale is '
'True, increase the loss scale every n steps.')
# The method for managing variables:
# parameter_server: variables are stored on a parameter server that holds
# the master copy of the variable. In local execution, a local device
# acts as the parameter server for each variable; in distributed
# execution, the parameter servers are separate processes in the
# cluster.
# For each step, each tower gets a copy of the variables from the
# parameter server, and sends its gradients to the param server.
# replicated: each GPU has its own copy of the variables. To apply
#     gradients, an all_reduce algorithm or regular cross-device
# aggregation is used to replicate the combined gradients to all
# towers (depending on all_reduce_spec parameter setting).
# independent: each GPU has its own copy of the variables, and gradients
# are not shared between towers. This can be used to check performance
# when no data is moved between GPUs.
# distributed_replicated: Distributed training only. Each GPU has a copy
# of the variables, and updates its copy after the parameter servers
# are all updated with the gradients from all servers. Only works with
# cross_replica_sync=true. Unlike 'replicated', currently never uses
# nccl all-reduce for replicating within a server.
# distributed_all_reduce: Distributed training where all replicas run
#     in a single session, using all-reduce to mutually reduce the
# gradients. Uses no parameter servers. When there is only one
# worker, this is the same as replicated.
# collective_all_reduce: Distributed training where all replicas run
#     independently except for variable initialization and for
# gradient reduction which is done via collective all-reduce.
# NOTE: collective_all_reduce in conjunction with use_fp16 can
# lead to NaNs in some models (resnet50). TODO(tucker): fix it.
# horovod: Distributed training using Horovod library. Runs workers using
# an MPI framework (e.g. Open MPI). Each worker runs training on
#     a single GPU, and averages gradients using NCCL or MPI all-reduce.
# See https://github.com/uber/horovod for more details.
flags.DEFINE_enum('variable_update', 'parameter_server',
('parameter_server', 'replicated', 'distributed_replicated',
'independent', 'distributed_all_reduce',
'collective_all_reduce', 'horovod'),
'The method for managing variables: parameter_server, '
'replicated, distributed_replicated, independent, '
'distributed_all_reduce, collective_all_reduce, horovod')
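# Illustrative invocation (the script name and flag values below are example
# assumptions, not the only supported combination): single machine, 8 GPUs,
# replicated variables with NCCL all-reduce.
#
#   python tf_cnn_benchmarks.py --model=resnet50 --num_gpus=8 \
#       --variable_update=replicated --all_reduce_spec=nccl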
flags.DEFINE_string('all_reduce_spec', None,
'A specification of the all_reduce algorithm to be used '
'for reducing gradients. For more details, see '
'parse_all_reduce_spec in variable_mgr.py. An '
'all_reduce_spec has BNF form:\n'
'int ::= positive whole number\n'
'g_int ::= int[KkMGT]?\n'
'alg_spec ::= alg | alg#int\n'
'range_spec ::= alg_spec | alg_spec/alg_spec\n'
'spec ::= range_spec | range_spec:g_int:range_spec\n'
'NOTE: not all syntactically correct constructs are '
'supported.\n\n'
'Examples:\n '
'"xring" == use one global ring reduction for all '
'tensors\n'
'"pscpu" == use CPU at worker 0 to reduce all tensors\n'
'"nccl" == use NCCL to locally reduce all tensors. '
'Limited to 1 worker.\n'
'"nccl/xring" == locally (to one worker) reduce values '
'using NCCL then ring reduce across workers.\n'
'"pscpu:32k:xring" == use pscpu algorithm for tensors of '
'size up to 32kB, then xring for larger tensors.')
# If variable_update==distributed_all_reduce then it may be advantageous
# to aggregate small tensors into one prior to reduction. These parameters
# control that aggregation.
flags.DEFINE_integer('agg_small_grads_max_bytes', 0,
'If > 0, try to aggregate tensors of less than this '
'number of bytes prior to all-reduce.')
flags.DEFINE_integer('agg_small_grads_max_group', 10,
'When aggregating small tensors for all-reduce do not '
'aggregate more than this many into one new tensor.')
flags.DEFINE_integer('allreduce_merge_scope', 1,
'Establish a name scope around this many '
'gradients prior to creating the all-reduce operations. '
'It may affect the ability of the backend to merge '
'parallel ops.')
# Distributed training parameters.
flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''),
'One of "ps", "worker", "controller", "". Empty for local '
'training')
flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('controller_host', None, 'optional controller host')
flags.DEFINE_integer('task_index', 0, 'Index of task within the job')
flags.DEFINE_string('server_protocol', 'grpc', 'protocol for servers')
flags.DEFINE_boolean('cross_replica_sync', True, '')
flags.DEFINE_string('horovod_device', '', 'Device to do Horovod all-reduce on: '
                    'empty (default), cpu or gpu. Default will utilize GPU if '
'Horovod was compiled with the HOROVOD_GPU_ALLREDUCE '
'option, and CPU otherwise.')
# Summary and Save & load checkpoints.
flags.DEFINE_integer('summary_verbosity', 0, 'Verbosity level for summary ops. '
'level 0: disable any summary.\n'
'level 1: small and fast ops, e.g.: learning_rate, '
'total_loss.\n'
'level 2: medium-cost ops, e.g. histogram of all '
'gradients.\n'
'level 3: expensive ops: images and histogram of each '
'gradient.\n')
flags.DEFINE_integer('save_summaries_steps', 0,
'How often to save summaries for trained models. Pass 0 '
'to disable summaries.')
flags.DEFINE_integer('save_model_secs', 0,
'How often to save trained models. Pass 0 to disable '
'saving checkpoints every N seconds. A checkpoint is '
'saved after training completes regardless of this '
'option.')
flags.DEFINE_integer('save_model_steps', None,
'How often to save trained models. If specified, '
'save_model_secs must not be specified.')
flags.DEFINE_integer('max_ckpts_to_keep', 5,
'Max number of checkpoints to keep.')
flags.DEFINE_string('train_dir', None,
'Path to session checkpoints. Pass None to disable saving '
'checkpoint at the end.')
flags.DEFINE_string('eval_dir', '/tmp/tf_cnn_benchmarks/eval',
'Directory where to write eval event logs.')
flags.DEFINE_string('backbone_model_path', None,
'Path to pretrained backbone model checkpoint. Pass None '
'if not using a backbone model.')
flags.DEFINE_enum('trt_mode', '', ['', 'FP32', 'FP16', 'INT8'],
'If this is specified in forward_only mode and '
'freeze_when_forward_only is set to True, use TensorRT to '
'optimize the graph before execution.')
flags.DEFINE_integer('trt_max_workspace_size_bytes', 4 << 30,
'Max workspace size bytes used by the TensorRT optimizer.')
# Benchmark logging for model garden metric
flags.DEFINE_string('benchmark_log_dir', None,
'The directory to place the log files containing the '
'results of benchmark. The logs are created by '
                    'BenchmarkFileLogger. Requires the root of the TensorFlow '
                    'models repository to be in $PYTHONPATH.')
flags.DEFINE_string('benchmark_test_id', None,
'The unique test ID of the benchmark run. It could be the '
'combination of key parameters. It is hardware independent '
                    'and could be used to compare the performance between '
'different test runs. This flag is designed for human '
'consumption, and does not have any impact within the '
'system.')
platforms_util.define_platform_params()
class GlobalStepWatcher(threading.Thread):
"""A helper class for global_step.
  Polls for changes in the global_step of the model, and finishes when the
  number of steps for the global run have been completed.
"""
def __init__(self, sess, global_step_op, start_at_global_step,
end_at_global_step):
threading.Thread.__init__(self)
self.sess = sess
self.global_step_op = global_step_op
self.start_at_global_step = start_at_global_step
self.end_at_global_step = end_at_global_step
self.start_time = 0
self.start_step = 0
self.finish_time = 0
self.finish_step = 0
def run(self):
while self.finish_time == 0:
time.sleep(.25)
global_step_val, = self.sess.run([self.global_step_op])
if self.start_time == 0 and global_step_val >= self.start_at_global_step:
# Use tf.logging.info instead of log_fn, since print (which is log_fn)
# is not thread safe and may interleave the outputs from two parallel
# calls to print, which can break tests.
tf.logging.info('Starting real work at step %s at time %s' %
(global_step_val, time.ctime()))
self.start_time = time.perf_counter()
self.start_step = global_step_val
if self.finish_time == 0 and global_step_val >= self.end_at_global_step:
tf.logging.info('Finishing real work at step %s at time %s' %
(global_step_val, time.ctime()))
self.finish_time = time.perf_counter()
self.finish_step = global_step_val
def done(self):
return self.finish_time > 0
def num_steps(self):
return self.finish_step - self.start_step
def elapsed_time(self):
return self.finish_time - self.start_time
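# A minimal usage sketch for GlobalStepWatcher (step numbers are hypothetical):
# time the window between global steps 10 and 110 while the main thread runs
# training, then derive steady-state throughput from that window.
#
#   watcher = GlobalStepWatcher(sess, global_step_op,
#                               start_at_global_step=10,
#                               end_at_global_step=110)
#   watcher.start()              # standard threading.Thread API
#   # ... main thread runs training steps ...
#   watcher.join()               # returns once end_at_global_step is reached
#   steps_per_sec = watcher.num_steps() / watcher.elapsed_time()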
class CheckpointNotFoundException(Exception):
pass
def create_config_proto(params):
"""Returns session config proto.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
"""
config = tf.ConfigProto()
config.allow_soft_placement = True
if params.num_intra_threads is None:
if params.device == 'gpu':
config.intra_op_parallelism_threads = 1
else:
config.intra_op_parallelism_threads = params.num_intra_threads
config.inter_op_parallelism_threads = params.num_inter_threads
config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
config.gpu_options.experimental.collective_ring_order = params.gpu_indices
config.gpu_options.force_gpu_compatible = params.force_gpu_compatible
config.experimental.use_numa_affinity = params.use_numa_affinity
if params.device == 'cpu':
# TODO(tucker): change num_gpus to num_devices
config.device_count['CPU'] = params.num_gpus
if params.allow_growth is not None:
config.gpu_options.allow_growth = params.allow_growth
if params.gpu_memory_frac_for_testing > 0:
config.gpu_options.per_process_gpu_memory_fraction = (
params.gpu_memory_frac_for_testing)
if params.use_unified_memory:
config.gpu_options.experimental.use_unified_memory = (
params.use_unified_memory)
if params.timestamped_allocator:
config.gpu_options.experimental.timestamped_allocator = (
params.timestamped_allocator)
if params.gpu_kt_max_interval > 0:
config.gpu_options.experimental.kernel_tracker_max_interval = (
params.gpu_kt_max_interval)
if params.gpu_kt_max_bytes > 0:
config.gpu_options.experimental.kernel_tracker_max_bytes = (
params.gpu_kt_max_bytes)
if params.gpu_kt_max_pending > 0:
config.gpu_options.experimental.kernel_tracker_max_pending = (
params.gpu_kt_max_pending)
if params.xla:
config.graph_options.optimizer_options.global_jit_level = (
tf.OptimizerOptions.ON_1)
if params.rewriter_config:
rewriter_config = rewriter_config_pb2.RewriterConfig()
text_format.Merge(params.rewriter_config, rewriter_config)
config.graph_options.rewrite_options.CopyFrom(rewriter_config)
elif not params.enable_optimizations:
config.graph_options.optimizer_options.opt_level = tf.OptimizerOptions.L0
config.graph_options.rewrite_options.disable_meta_optimizer = True
elif params.variable_update == 'collective_all_reduce':
rewrite_options = config.graph_options.rewrite_options
rewrite_options.scoped_allocator_optimization = (
rewriter_config_pb2.RewriterConfig.ON)
rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
if params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
config.gpu_options.visible_device_list = str(hvd.local_rank())
# For collective_all_reduce, ignore all devices except current worker.
if params.variable_update == 'collective_all_reduce':
del config.device_filters[:]
config.device_filters.append(
'/job:%s/replica:0/task:%d' % (params.job_name, params.task_index))
# TODO(b/117324590): Re-enable PinToHostOptimizer when b/117324590 is fixed.
# Currently we have to disable PinToHostOptimizer w/ XLA since it causes
# OOM/perf cliffs.
config.graph_options.rewrite_options.pin_to_host_optimization = (
rewriter_config_pb2.RewriterConfig.OFF)
return config
def get_mode_from_params(params):
"""Returns the mode in which this script is running.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
Raises:
ValueError: Unsupported params settings.
"""
if params.forward_only and params.eval:
    raise ValueError('The forward_only and eval parameters cannot both be True')
if params.eval:
return constants.BenchmarkMode.EVAL
elif params.forward_only:
return constants.BenchmarkMode.FORWARD_ONLY
elif (params.eval_during_training_every_n_steps or
params.eval_during_training_every_n_epochs or
params.eval_during_training_at_specified_steps or
params.eval_during_training_at_specified_epochs):
return constants.BenchmarkMode.TRAIN_AND_EVAL
else:
return constants.BenchmarkMode.TRAIN
# How many digits to show for the loss and accuracies during training.
LOSS_AND_ACCURACY_DIGITS_TO_SHOW = 3
def benchmark_one_step(sess,
fetches,
step,
batch_size,
step_train_times,
trace_filename,
partitioned_graph_file_prefix,
profiler,
image_producer,
params,
summary_op=None,
show_images_per_sec=True,
benchmark_logger=None,
collective_graph_key=0,
should_output_files=True):
"""Advance one step of benchmarking."""
should_profile = profiler and 0 <= step < _NUM_STEPS_TO_PROFILE
need_options_and_metadata = (
should_profile or collective_graph_key > 0 or
((trace_filename or partitioned_graph_file_prefix) and step == -2)
)
if need_options_and_metadata:
run_options = tf.RunOptions()
if (trace_filename and step == -2) or should_profile:
run_options.trace_level = tf.RunOptions.FULL_TRACE
if partitioned_graph_file_prefix and step == -2:
run_options.output_partition_graphs = True
if collective_graph_key > 0:
run_options.experimental.collective_graph_key = collective_graph_key
run_metadata = tf.RunMetadata()
else:
run_options = None
run_metadata = None
summary_str = None
start_time = time.perf_counter()
if summary_op is None:
results = sess.run(fetches, options=run_options, run_metadata=run_metadata)
else:
(results, summary_str) = sess.run(
[fetches, summary_op], options=run_options, run_metadata=run_metadata)
if not params.forward_only:
lossval = results['average_loss']
else:
lossval = 0.
if image_producer is not None:
image_producer.notify_image_consumption()
train_time = time.perf_counter() - start_time
step_train_times.append(train_time)
if (show_images_per_sec and step >= 0 and
(step == 0 or (step + 1) % params.display_every == 0)):
speed_mean, speed_uncertainty, speed_jitter = get_perf_timing(
batch_size, step_train_times, params.display_perf_ewma)
log_str = '%i\t%s\t%.*f' % (
step + 1,
get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter),
LOSS_AND_ACCURACY_DIGITS_TO_SHOW, lossval)
if 'top_1_accuracy' in results:
log_str += '\t%.*f\t%.*f' % (
LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_1_accuracy'],
LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_5_accuracy'])
log_fn(log_str)
if benchmark_logger:
benchmark_logger.log_metric(
'current_examples_per_sec', speed_mean, global_step=step + 1)
if 'top_1_accuracy' in results:
benchmark_logger.log_metric(
'top_1_accuracy', results['top_1_accuracy'], global_step=step + 1)
benchmark_logger.log_metric(
'top_5_accuracy', results['top_5_accuracy'], global_step=step + 1)
if need_options_and_metadata:
if should_profile:
profiler.add_step(step, run_metadata)
if trace_filename and step == -2 and should_output_files:
log_fn('Dumping trace to %s' % trace_filename)
trace_dir = os.path.dirname(trace_filename)
if not gfile.Exists(trace_dir):
gfile.MakeDirs(trace_dir)
with gfile.Open(trace_filename, 'w') as trace_file:
if params.use_chrome_trace_format:
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
trace_file.write(trace.generate_chrome_trace_format(show_memory=True))
else:
trace_file.write(str(run_metadata.step_stats))
if partitioned_graph_file_prefix and step == -2 and should_output_files:
path, filename = os.path.split(partitioned_graph_file_prefix)
if '.' in filename:
base_filename, ext = filename.rsplit('.', 1)
ext = '.' + ext
else:
base_filename, ext = filename, ''
as_text = filename.endswith('txt')
for graph_def in run_metadata.partition_graphs:
device = graph_def.node[0].device.replace('/', '_').replace(':', '_')
graph_filename = '%s%s%s' % (base_filename, device, ext)
log_fn('Writing partitioned GraphDef as %s to %s' % (
'text' if as_text else 'binary',
os.path.join(path, graph_filename)))
tf.train.write_graph(graph_def, path, graph_filename, as_text)
return (summary_str, lossval)
def get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter, scale=1):
if scale == 1:
# TODO(laigd): rename 'images' to maybe 'inputs', same below.
return ('images/sec: %.1f +/- %.1f (jitter = %.1f)' %
(speed_mean, speed_uncertainty, speed_jitter))
else:
return 'images/sec: %.1f' % speed_mean
def get_perf_timing(batch_size, step_train_times, ewma_alpha=None, scale=1):
"""Calculate benchmark processing speed."""
times = np.array(step_train_times)
speeds = batch_size / times
if ewma_alpha:
weights = np.logspace(len(times)-1, 0, len(times), base=1-ewma_alpha)
time_mean = np.average(times, weights=weights)
else:
time_mean = np.mean(times)
speed_mean = scale * batch_size / time_mean
speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds)))
speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds)))
return speed_mean, speed_uncertainty, speed_jitter
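# The jitter returned above is a robust spread estimate: 1.4826 * the median
# absolute deviation (MAD), which approximates the standard deviation for
# normally distributed speeds while staying insensitive to outlier steps.
# Worked example with hypothetical speeds [100, 102, 98, 140] images/sec:
# median = 101, absolute deviations = [1, 1, 3, 39], MAD = 2, so
# jitter = 1.4826 * 2 ~= 2.97, whereas the plain standard deviation (~17.4)
# would be dominated by the single 140 outlier.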
def load_checkpoint(saver, sess, ckpt_dir):
"""Loads checkpoint from provided directory or full path.
Args:
saver: Saver used to restore the checkpoint.
sess: TensorFlow session.
ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.
Returns:
Global step.
"""
model_checkpoint_path = _get_checkpoint_to_load(ckpt_dir)
global_step = model_checkpoint_path.split('/')[-1].split('-')[-1]
if not global_step.isdigit():
global_step = 0
else:
global_step = int(global_step)
saver.restore(sess, model_checkpoint_path)
log_fn('Successfully loaded model from %s.' % model_checkpoint_path)
return global_step
def _get_checkpoint_to_load(ckpt_dir):
"""Returns which checkpoint to load.
Args:
ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.
Returns:
Full path to checkpoint to load.
Raises:
CheckpointNotFoundException: If checkpoint is not found.
"""
p = re.compile(r'ckpt-\d+$')
if p.search(ckpt_dir):
model_checkpoint_path = ckpt_dir
else:
# Finds latest checkpoint in directory provided
ckpt = tf.train.get_checkpoint_state(ckpt_dir)
if ckpt and ckpt.model_checkpoint_path:
model_checkpoint_path = ckpt.model_checkpoint_path
else:
raise CheckpointNotFoundException('No checkpoint file found in dir:{}'.
format(ckpt_dir))
return model_checkpoint_path
# Params are passed to BenchmarkCNN's constructor. Params is a map from name
# to value, with one field per key in flags.param_specs.
#
# Call make_params() or make_params_from_flags() below to construct a Params
# tuple with default values from flags.param_specs, rather than constructing
# Params directly.
Params = namedtuple('Params', flags.param_specs.keys()) # pylint: disable=invalid-name
def validate_params(params):
"""Validates that the Params tuple had valid values.
When command-line flags are defined for each ParamSpec by calling
flags.define_flags(), calling this function is unnecessary because absl
already does flag validation. Otherwise, this function should be called.
Args:
params: A Params tuple.
Raises:
ValueError: An element of params had an invalid value.
"""
for name, value in params._asdict().items():
param_spec = flags.param_specs[name]
if param_spec.flag_type in ('integer', 'float'):
if (value is not None and param_spec.kwargs['lower_bound'] is not None and
value < param_spec.kwargs['lower_bound']):
raise ValueError('Param %s value of %s is lower than the lower bound '
'of %s' %
(name, value, param_spec.kwargs['lower_bound']))
if (value is not None and param_spec.kwargs['upper_bound'] is not None and
param_spec.kwargs['upper_bound'] < value):
raise ValueError('Param %s value of %s is higher than the upper bound '
'of %s' %
(name, value, param_spec.kwargs['upper_bound']))
elif (value is not None and param_spec.flag_type == 'enum' and
value not in param_spec.kwargs['enum_values']):
      raise ValueError('Param %s of value %s is not in %s' %
(name, value, param_spec.kwargs['enum_values']))
def make_params(**kwargs):
"""Create a Params tuple for BenchmarkCNN from kwargs.
Default values are filled in from flags.param_specs.
Args:
**kwargs: kwarg values will override the default values.
Returns:
Params namedtuple for constructing BenchmarkCNN.
"""
# Create a (name: default_value) map from flags.param_specs.
default_kwargs = {
name: flags.param_specs[name].default_value
for name in flags.param_specs
}
params = Params(**default_kwargs)._replace(**kwargs)
validate_params(params)
return params
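# A minimal usage sketch (the override values are hypothetical): build a
# Params tuple with a few overrides and library defaults for everything else,
# then construct the benchmark from it.
#
#   params = make_params(model='resnet50', num_gpus=2, batch_size=64,
#                        variable_update='replicated')
#   bench = BenchmarkCNN(params)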
def make_params_from_flags():
"""Create a Params tuple for BenchmarkCNN from absl_flags.FLAGS.
Returns:
Params namedtuple for constructing BenchmarkCNN.
"""
# Collect (name: value) pairs for absl_flags.FLAGS with matching names in
# flags.param_specs.
flag_values = {name: getattr(absl_flags.FLAGS, name)
for name in flags.param_specs.keys()}
return Params(**flag_values)
def remove_param_fields(params, fields_to_remove):
"""Remove fields from a Params namedtuple."""
params_dict = params._asdict()
for field in fields_to_remove:
assert field in params_dict, 'Invalid Params field: ' + field
params_dict = {k: v for k, v in params_dict.items()
if k not in fields_to_remove}
new_params_type = namedtuple('Params', params_dict.keys())
return new_params_type(**params_dict)
def get_num_batches_and_epochs(params, batch_size, num_examples_per_epoch):
"""Returns the number of batches and epochs to run for.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
batch_size: The number of images per step.
num_examples_per_epoch: The number of images in a single epoch.
Returns:
num_batches: The number of batches to run for.
    num_epochs: The number of epochs to run for. This might be slightly
      larger than params.num_epochs if specified, because the number of batches
      must be an integer.
Raises:
ValueError: Invalid or unsupported params.
"""
if params.num_batches and params.num_epochs:
raise ValueError('At most one of --num_batches and --num_epochs may be '
'specified.')
if params.num_epochs:
num_batches = int(params.num_epochs * num_examples_per_epoch +
batch_size - 1) // batch_size
else:
num_batches = params.num_batches or _DEFAULT_NUM_BATCHES
num_epochs = num_batches * batch_size / num_examples_per_epoch
return (num_batches, num_epochs)
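# Worked example with hypothetical numbers: for --num_epochs=0.1, a per-step
# batch size of 256 and 50000 examples per epoch,
# num_batches = int(0.1 * 50000 + 255) // 256 = 5255 // 256 = 20, and the
# returned num_epochs = 20 * 256 / 50000 = 0.1024, slightly larger than
# requested because the number of batches must be an integer.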
def get_piecewise_learning_rate(piecewise_learning_rate_schedule,
global_step, num_batches_per_epoch):
"""Returns a piecewise learning rate tensor.
Args:
piecewise_learning_rate_schedule: The --piecewise_learning_rate_schedule
parameter
global_step: Scalar tensor representing the global step.
num_batches_per_epoch: float indicating the number of batches per epoch.
Returns:
A scalar float tensor, representing the learning rate.
Raises:
ValueError: piecewise_learning_rate_schedule is not formatted correctly.
"""
pieces = piecewise_learning_rate_schedule.split(';')
if len(pieces) % 2 == 0:
raise ValueError('--piecewise_learning_rate_schedule must have an odd '
'number of components')
values = []
boundaries = []
for i, piece in enumerate(pieces):
if i % 2 == 0:
try:
values.append(float(piece))
except ValueError:
raise ValueError('Invalid learning rate: ' + piece)
else:
try:
boundaries.append(int(int(piece) * num_batches_per_epoch) - 1)
except ValueError:
raise ValueError('Invalid epoch: ' + piece)
return tf.train.piecewise_constant(global_step, boundaries, values,
name='piecewise_learning_rate')
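# Worked example with hypothetical numbers: for the schedule
# '0.3;10;0.2;25;0.1' and num_batches_per_epoch = 100, this yields
# values = [0.3, 0.2, 0.1] and boundaries = [999, 2499], i.e. steps 0-999 use
# learning rate 0.3, steps 1000-2499 use 0.2, and later steps use 0.1.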
def get_learning_rate(params, global_step, num_examples_per_epoch, model,
batch_size):
"""Returns a learning rate tensor based on global_step.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
global_step: Scalar tensor representing the global step.
num_examples_per_epoch: The number of examples per epoch.
model: The model.Model object to obtain the default learning rate from if no
learning rate is specified.
batch_size: Number of examples per step
Returns:
A scalar float tensor, representing the learning rate. When evaluated, the
learning rate depends on the current value of global_step.
Raises:
ValueError: Invalid or unsupported params.
"""
with tf.name_scope('learning_rate'):
num_batches_per_epoch = num_examples_per_epoch / batch_size
if params.piecewise_learning_rate_schedule:
if (params.init_learning_rate is not None or
params.learning_rate_decay_factor or
params.minimum_learning_rate or params.num_epochs_per_decay):
raise ValueError('No other learning rate-related flags can be '
'specified if --piecewise_learning_rate_schedule is '
'specified')
learning_rate = get_piecewise_learning_rate(
params.piecewise_learning_rate_schedule,
global_step, num_batches_per_epoch)
elif params.init_learning_rate is not None:
learning_rate = params.init_learning_rate
if (params.num_epochs_per_decay > 0 and
params.learning_rate_decay_factor > 0):
decay_steps = int(num_batches_per_epoch * params.num_epochs_per_decay)
# Decay the learning rate exponentially based on the number of steps.
learning_rate = tf.train.exponential_decay(
params.init_learning_rate,
global_step,
decay_steps,
params.learning_rate_decay_factor,
staircase=True)
if params.minimum_learning_rate != 0.:
learning_rate = tf.maximum(learning_rate,
params.minimum_learning_rate)
else:
learning_rate = model.get_learning_rate(global_step, batch_size)
if params.num_learning_rate_warmup_epochs > 0 and (
params.init_learning_rate is not None or
params.piecewise_learning_rate_schedule):
warmup_steps = int(num_batches_per_epoch *
params.num_learning_rate_warmup_epochs)
init_lr = params.init_learning_rate
if init_lr is None:
init_lr = float(params.piecewise_learning_rate_schedule.split(';')[0])
warmup_lr = init_lr * tf.cast(global_step, tf.float32) / tf.cast(
warmup_steps, tf.float32)
learning_rate = tf.cond(global_step < warmup_steps,
lambda: warmup_lr, lambda: learning_rate)
learning_rate = mlperf.logger.log_deferred_tensor_value(
mlperf.tags.OPT_LR, learning_rate, global_step, every_n=100)
return learning_rate
def get_optimizer(params, learning_rate):
"""Returns the optimizer that should be used based on params."""
if params.optimizer == 'momentum':
mlperf.logger.log(key=mlperf.tags.OPT_NAME,
value=mlperf.tags.SGD_WITH_MOMENTUM)
mlperf.logger.log(key=mlperf.tags.OPT_MOMENTUM, value=params.momentum)
opt = tf.train.MomentumOptimizer(
learning_rate, params.momentum, use_nesterov=True)
elif params.optimizer == 'sgd':
mlperf.logger.log(key=mlperf.tags.OPT_NAME, value=mlperf.tags.SGD)
opt = tf.train.GradientDescentOptimizer(learning_rate)
elif params.optimizer == 'rmsprop':
opt = tf.train.RMSPropOptimizer(
learning_rate,
params.rmsprop_decay,
momentum=params.rmsprop_momentum,
epsilon=params.rmsprop_epsilon)
elif params.optimizer == 'adam':
opt = tf.train.AdamOptimizer(learning_rate, params.adam_beta1,
params.adam_beta2, params.adam_epsilon)
else:
raise ValueError('Optimizer "{}" was not recognized'.
format(params.optimizer))
return opt
def generate_tfprof_profile(profiler, tfprof_file):
"""Generates a tfprof profile, writing it to a file and printing top ops.
Args:
profiler: A tf.profiler.Profiler. `profiler.add_step` must have already been
called.
tfprof_file: The filename to write the ProfileProto to.
"""
profile_proto = profiler.serialize_to_string()
log_fn('Dumping ProfileProto to %s' % tfprof_file)
with gfile.Open(tfprof_file, 'wb') as f:
f.write(profile_proto)
# Print out the execution times of the top operations. Note this
# information can also be obtained with the dumped ProfileProto, but
# printing it means tfprof doesn't have to be used if all the user wants
# is the top ops.
options = tf.profiler.ProfileOptionBuilder.time_and_memory()
options['max_depth'] = _NUM_OPS_TO_PRINT
options['order_by'] = 'accelerator_micros'
profiler.profile_operations(options)
class BenchmarkCNN(object):
"""Class for benchmarking a cnn network."""
def __init__(self, params, dataset=None, model=None):
"""Initialize BenchmarkCNN.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
dataset: If not None, the dataset to use. Otherwise, params is used to
obtain the dataset.
model: If not None, the model to use. Otherwise, params is used to obtain
the model.
Raises:
ValueError: Unsupported params settings.
"""
mlperf.logger.log(key=mlperf.tags.RUN_START)
self.params = params
if params.eval:
self._doing_eval = True
else:
# Note self._doing_eval can later switch to True in self._do_eval() if
# self.params.eval_during_training_* is specified.
self._doing_eval = False
self.dataset = dataset or datasets.create_dataset(self.params.data_dir,
self.params.data_name)
self.model = model or model_config.get_model_config(
self.params.model, self.dataset, self.params)
self.trace_filename = self.params.trace_file
self.rewriter_config = self.params.rewriter_config
autotune_threshold = self.params.autotune_threshold if (
self.params.autotune_threshold) else 1
min_autotune_warmup = 5 * autotune_threshold * autotune_threshold
self.num_warmup_batches = self.params.num_warmup_batches if (
self.params.num_warmup_batches is not None) else max(
10, min_autotune_warmup)
self.graph_file = self.params.graph_file
self.resize_method = self.params.resize_method
self.sync_queue_counter = 0
self.num_gpus = self.params.num_gpus
if self.params.gpu_indices:
self.gpu_indices = [int(x) for x in self.params.gpu_indices.split(',')]
else:
self.gpu_indices = [x for x in range(self.num_gpus)]
if (self.params.device == 'cpu' and self.params.data_format == 'NCHW' and
not self.params.mkl):
raise ValueError('device=cpu requires that data_format=NHWC')
if ((self.params.num_epochs_per_decay or
self.params.learning_rate_decay_factor) and
not (self.params.init_learning_rate is not None and
self.params.num_epochs_per_decay
and self.params.learning_rate_decay_factor)):
raise ValueError('If one of num_epochs_per_decay or '
'learning_rate_decay_factor is set, both must be set '
'and init_learning_rate must be set')
if (self.params.minimum_learning_rate and
not (self.params.init_learning_rate is not None and
self.params.num_epochs_per_decay and
self.params.learning_rate_decay_factor)):
raise ValueError('minimum_learning_rate requires init_learning_rate, '
'num_epochs_per_decay, and '
'learning_rate_decay_factor to be set')
if (self.params.use_fp16 and self.params.fp16_vars and
'replicated' in self.params.variable_update and
self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec):
raise ValueError('fp16 variables are not supported with NCCL')
if (self.params.use_fp16 and self.params.fp16_vars and
self.params.gradient_repacking):
raise ValueError('--fp16_vars cannot be used with --gradient_repacking')
if self.params.variable_update == 'horovod' and self.params.num_gpus > 1:
raise ValueError('Horovod benchmarks require num_gpus=1 on each worker')
if self.params.variable_update == 'horovod' and self.params.job_name:
raise ValueError('job_name should not be specified for Horovod.')
if self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale:
if self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec:
raise ValueError('Automatic loss scaling is not supported with NCCL.')
if self.params.variable_update not in ('parameter_server', 'replicated',
'independent'):
raise ValueError('Automatic loss scaling is not supported with '
'variable_update=%s.' % self.params.variable_update)
if self.params.staged_vars:
raise ValueError('Automatic loss scaling is not supported with '
'staged_vars.')
if (self.params.debugger is not None and self.params.debugger != 'cli' and
':' not in self.params.debugger):
raise ValueError('--debugger must be "cli" or in the form '
'host:port')
if self.params.hierarchical_copy and self.params.num_gpus <= 1:
raise ValueError('--hierarchical_copy requires --num_gpus to be greater '
'than 1')
if params.save_model_secs and params.save_model_steps:
raise ValueError('At most one of --save_model_secs and '
'--save_model_steps can be specified')
eval_during_training_flags = list(map(bool, [
params.eval_during_training_every_n_steps,
params.eval_during_training_every_n_epochs,
params.eval_during_training_at_specified_steps,
params.eval_during_training_at_specified_epochs,
]))
if eval_during_training_flags.count(True) > 1:
raise ValueError('At most one flag with --eval_during_training_* prefix '
'must be specified.')
eval_during_training_enabled = any(eval_during_training_flags)
if eval_during_training_enabled:
if params.eval:
raise ValueError('At most one of --eval and --eval_during_training_* '
'must be specified')
if params.forward_only:
raise ValueError('At most one of --forward_only and '
'--eval_during_training_* must be specified')
if params.job_name:
raise ValueError('--eval_during_training_* is not yet supported in '
'distributed mode.')
if params.staged_vars:
raise ValueError('--eval_during_training_* is not currently compatible '
'with --staged_vars')
if params.stop_at_top_1_accuracy and not eval_during_training_enabled:
raise ValueError('--stop_at_top_1_accuracy is only supported with '
'--eval_during_training_*')
if params.collect_eval_results_async and params.model != 'ssd300':
raise ValueError('--collect_eval_results_async only works with ssd300 '
'model currently.')
if self.params.forward_only and self.params.freeze_when_forward_only:
if self.params.train_dir is not None:
raise ValueError('In forward_only mode, when --freeze_when_forward_only'
' is True, --train_dir should not be specified')
if self.params.data_dir and not self.params.datasets_use_prefetch:
raise ValueError('In forward_only mode, when --freeze_when_forward_only'
' is True and --data_dir is set, '
'--datasets_use_prefetch should be set to True')
if self.params.job_name:
raise ValueError('In forward_only mode, when --freeze_when_forward_only'
' is True, --job_name should not be specified and '
'distributed running is not supported')
self.forward_only_and_freeze = True
else:
self.forward_only_and_freeze = False
if self.params.trt_mode:
raise ValueError('--trt_mode should not be specified if one of '
'--forward_only and --freeze_when_forward_only is set '
'to False')
self.mode = get_mode_from_params(self.params)
# Use the batch size from the command line if specified, otherwise use the
# model's default batch size. Scale the benchmark's batch size by the
# number of GPUs.
if self.params.batch_size > 0:
self.model.set_batch_size(self.params.batch_size)
self.batch_size = self.model.get_batch_size() * self.num_gpus
if self.mode in (constants.BenchmarkMode.TRAIN,
constants.BenchmarkMode.TRAIN_AND_EVAL):
self.train_batch_size = self.batch_size
else:
self.train_batch_size = None
if self.mode in (constants.BenchmarkMode.EVAL,
constants.BenchmarkMode.TRAIN_AND_EVAL):
if self.params.eval_batch_size > 0:
self.eval_batch_size = self.params.eval_batch_size * self.num_gpus
else:
self.eval_batch_size = self.batch_size
else:
self.eval_batch_size = None
self.batch_group_size = self.params.batch_group_size
self.enable_auto_loss_scale = (
self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale)
self.loss_scale = None
self.loss_scale_normal_steps = None
self.job_name = self.params.job_name # "" for local training
# PS server is used for distributed jobs not using all-reduce.
use_ps_server = self.job_name and (self.params.variable_update !=
'distributed_all_reduce' and
self.params.variable_update !=
'collective_all_reduce')
# controller is used for distributed_all_reduce with > 1 worker.
use_controller = (
self.params.variable_update == 'distributed_all_reduce' and
self.job_name)
if use_controller and not params.controller_host:
raise ValueError('When variable_update==distributed_all_reduce '
'controller_host must also be specified.')
self.single_session = (
self.params.variable_update == 'distributed_all_reduce')
# collective_all_reduce doesn't need a controller or ps
self.distributed_collective = (
self.params.variable_update == 'collective_all_reduce' and
self.job_name)
self.local_parameter_device_flag = self.params.local_parameter_device
if self.job_name:
self.task_index = self.params.task_index
self.cluster_manager = platforms_util.get_cluster_manager(
params, create_config_proto(params))
assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager)
worker_prefix = '/job:worker/replica:0/task:%s' % self.task_index
if use_ps_server:
self.param_server_device = tf.train.replica_device_setter(
worker_device=worker_prefix + '/cpu:0',
cluster=self.cluster_manager.get_cluster_spec())
# This is the device on which the queues for managing synchronization
# between servers should be stored.
self.sync_queue_devices = [
'/job:ps/replica:0/task:%s/cpu:0' % i
for i in range(self.cluster_manager.num_ps())
]
else:
self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0']
else:
self.task_index = 0
self.cluster_manager = None
worker_prefix = ''
self.param_server_device = '/%s:0' % self.params.local_parameter_device
self.sync_queue_devices = [self.param_server_device]
if self.cluster_manager:
self.num_workers = self.cluster_manager.num_workers()
elif self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
self.num_workers = hvd.size()
else:
self.num_workers = 1
self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0
if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl':
raise ValueError('--all_reduce_spec=nccl is invalid in a '
'multi-worker job')
# Device to use for ops that need to always run on the local worker's CPU.
self.cpu_device = '%s/cpu:0' % worker_prefix
# Device to use for ops that need to always run on the local worker's
# compute device, and never on a parameter server device.
self.raw_devices = [
'%s/%s:%i' % (worker_prefix, self.params.device, i)
for i in xrange(self.num_gpus)
]
subset = 'validation' if params.eval else 'train'
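# num_batches/num_epochs are computed against the global batch size across
# all workers (per-worker batch size times num_workers).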
self.num_batches, self.num_epochs = get_num_batches_and_epochs(
params, self.batch_size * self.num_workers,
self.dataset.num_examples_per_epoch(subset))
if self.mode in (constants.BenchmarkMode.EVAL,
constants.BenchmarkMode.TRAIN_AND_EVAL):
# TODO(reedwm): Currently we do extra eval logic for num_eval_batches and
# the preprocessor. We should encapsulate this logic into a shared
# function or class.
if params.num_eval_batches is None and params.num_eval_epochs is None:
eval_params = self.params
else:
eval_params = self.params._replace(
num_batches=self.params.num_eval_batches,
num_epochs=self.params.num_eval_epochs)
self.num_eval_batches, self.num_eval_epochs = get_num_batches_and_epochs(
eval_params, self.eval_batch_size * self.num_workers,
self.dataset.num_examples_per_epoch('validation'))
else:
self.num_eval_batches, self.num_eval_epochs = None, None
num_train_examples_per_epoch = self.dataset.num_examples_per_epoch('train')
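# The epoch-based --eval_during_training_* flags below are converted into the
# set of global training steps at which evaluation should run.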
if self.params.eval_during_training_every_n_epochs:
n_epochs = self.params.eval_during_training_every_n_epochs
self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size)
for e in np.arange(n_epochs, self.num_epochs, n_epochs)}
if self.params.eval_during_training_at_specified_steps:
try:
self.eval_during_training_at_specified_steps = set(map(
int, self.params.eval_during_training_at_specified_steps))
except ValueError:
raise ValueError('Param eval_during_training_at_specified_steps value '
'of %s cannot be converted to a list of integers.' %
(self.params.eval_during_training_at_specified_steps))
if self.params.eval_during_training_at_specified_epochs:
try:
n_epochs = list(map(
float, self.params.eval_during_training_at_specified_epochs))
offset = n_epochs[0] - 1
if offset.is_integer():
offset = int(offset)
mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size)
for e in n_epochs}
except ValueError:
raise ValueError('Param eval_during_training_at_specified_epochs value '
'of %s cannot be converted to a list of floats.' %
(self.params.eval_during_training_at_specified_epochs))
if params.eval_during_training_every_n_epochs:
offset = params.eval_during_training_every_n_epochs - 1
if offset.is_integer():
offset = int(offset)
mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
if (self.params.staged_vars and
self.params.variable_update != 'parameter_server'):
raise ValueError('staged_vars for now is only supported with '
'variable_update=parameter_server')
if self.params.variable_update == 'parameter_server':
if self.job_name:
if not self.params.staged_vars:
self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS(
self)
else:
self.variable_mgr = (
variable_mgr.VariableMgrDistributedFetchFromStagedPS(self))
else:
if not self.params.staged_vars:
self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
else:
self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS(
self)
elif self.params.variable_update == 'replicated':
if self.job_name:
raise ValueError('Invalid variable_update in distributed mode: %s' %
self.params.variable_update)
self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
self, self.params.all_reduce_spec,
self.params.agg_small_grads_max_bytes,
self.params.agg_small_grads_max_group,
self.params.allreduce_merge_scope)
elif self.params.variable_update == 'distributed_all_reduce':
assert self.params.cross_replica_sync
self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce(
self, self.params.all_reduce_spec,
('worker' if self.num_workers > 1 else 'localhost'),
self.num_workers, self.params.agg_small_grads_max_bytes,
self.params.agg_small_grads_max_group,
self.params.allreduce_merge_scope)
elif self.params.variable_update == 'collective_all_reduce':
assert self.params.cross_replica_sync
self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce(
self, self.params.all_reduce_spec,
self.num_workers, self.num_gpus, self.task_index,
self.params.allreduce_merge_scope)
elif self.params.variable_update == 'distributed_replicated':
assert self.params.cross_replica_sync
if not self.job_name:
raise ValueError('Invalid variable_update in local mode: %s' %
self.params.variable_update)
self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self)
elif self.params.variable_update in ('independent', 'horovod'):
if self.job_name:
raise ValueError('Invalid variable_update in distributed mode: %s' %
self.params.variable_update)
self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
else:
raise ValueError(
'Invalid variable_update: %s' % self.params.variable_update)
# Device to use for running on the local worker's compute device, but
# with variables assigned to parameter server devices.
self.devices = self.variable_mgr.get_devices()
if self.job_name:
if use_ps_server:
self.global_step_device = self.param_server_device
elif self.params.variable_update == 'collective_all_reduce':
self.global_step_device = self.cpu_device
else:
self.global_step_device = '/job:worker/replica:0/task:0/cpu:0'
else:
self.global_step_device = self.cpu_device
self.input_preprocessor = None
self.eval_input_preprocessor = None
if not self.dataset.use_synthetic_gpu_inputs():
if not self.params.eval:
self.input_preprocessor = self.get_input_preprocessor()
if self.mode in (constants.BenchmarkMode.EVAL,
constants.BenchmarkMode.TRAIN_AND_EVAL):
with self._do_eval():
self.eval_input_preprocessor = self.get_input_preprocessor()
self.datasets_use_prefetch = (
self.params.datasets_use_prefetch and
# TODO(rohanj): Figure out why --datasets_use_prefetch freezes on the
# CPU.
self.params.device.lower() != 'cpu' and
self.input_preprocessor and
self.input_preprocessor.supports_datasets())
self.init_global_step = 0
self._config_benchmark_logger()
if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
# Remove "eval" from params so it is not accidentally used. Since eval can
# still occur despite params.eval being False, params.eval should never
# be used. We cannot yet remove this unconditionally, because the SSD
# model still uses params.eval, and hence does not work properly with
# --eval_during_training_*.
# TODO(b/116627045): We should also remove fields that have an eval
# equivalent, like num_batches and num_eval_batches.
self.params = remove_param_fields(self.params, {'eval'})
@contextlib.contextmanager
def _do_eval(self):
"""Context manager to switches BenchmarkCNN to eval mode.
Any evaluation code should be put under this context manager. This context
manager switches self._doing_eval to True. It also switches certain
attributes, like self.num_batches and self.num_epochs, to be the number of
batches and epochs for evaluation respectively
Yields:
Nothing.
"""
# TODO(b/116627045): Find a more general way of switching attributes to the
# eval equivalents.
old_doing_eval = self._doing_eval
old_num_batches = self.num_batches
old_num_epochs = self.num_epochs
old_batch_size = self.batch_size
try:
self._doing_eval = True
self.num_batches = self.num_eval_batches
self.num_epochs = self.num_eval_epochs
self.batch_size = self.eval_batch_size
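# The model stores the per-device batch size, so divide the global eval batch
# size by the number of GPUs.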
self.model.set_batch_size(self.eval_batch_size // self.num_gpus)
yield
finally:
self._doing_eval = old_doing_eval
self.num_batches = old_num_batches
self.num_epochs = old_num_epochs
self.batch_size = old_batch_size
self.model.set_batch_size(old_batch_size // self.num_gpus)
def _config_benchmark_logger(self):
"""Config the model garden benchmark logger."""
model_benchmark_logger = None
if self.params.benchmark_log_dir is not None:
try:
from official.r1.utils.logs import logger as models_logger # pylint: disable=g-import-not-at-top
except ImportError:
tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH '
'in order to use BenchmarkLogger. Configured '
'benchmark_log_dir: %s'
% self.params.benchmark_log_dir)
raise
model_benchmark_logger = models_logger.BenchmarkFileLogger(
self.params.benchmark_log_dir)
self.benchmark_logger = model_benchmark_logger
# TODO(laigd): this changes the global device list which is used everywhere,
# consider refactoring it.
def reset_devices_for_task(self, task_num, is_local=False):
"""Used to imitate another task when building a distributed graph."""
worker_prefix = ('/job:localhost' if is_local else
'/job:worker/replica:0/task:%s' % task_num)
self.cpu_device = '%s/cpu:0' % worker_prefix
self.raw_devices = [
'%s/%s:%i' % (worker_prefix, self.params.device, i)
for i in xrange(self.num_gpus)
]
self.devices = self.variable_mgr.get_devices()
def raw_devices_across_tasks(self, is_local=False):
"""Returns list of raw device names across all tasks."""
if is_local:
assert self.num_workers == 1
return self.raw_devices
else:
return [
'job:worker/replica:0/task%s/%s:%i' % (t, self.params.device, i)
for t in xrange(self.num_workers)
for i in xrange(self.num_gpus)
]
def print_info(self):
"""Print basic information."""
benchmark_info = self._get_params_info()
log_fn('Model: %s' % self.model.get_model_name())
log_fn('Dataset: %s' % benchmark_info['dataset_name'])
log_fn('Mode: %s' % self.mode)
log_fn('SingleSess: %s' % benchmark_info['single_session'])
log_fn('Batch size: %s global' % (self.batch_size * self.num_workers))
log_fn(' %s per device' % (self.batch_size //
len(self.raw_devices)))
if self.batch_group_size > 1:
log_fn(' %d batches per preprocessing group' %
self.batch_group_size)
log_fn('Num batches: %d' % self.num_batches)
log_fn('Num epochs: %.2f' % self.num_epochs)
log_fn('Devices: %s' % benchmark_info['device_list'])
log_fn('NUMA bind: %s' % self.params.use_numa_affinity)
log_fn('Data format: %s' % self.params.data_format)
if self.rewriter_config:
log_fn('RewriterConfig: %s' % self.rewriter_config)
log_fn('Optimizer: %s' % self.params.optimizer)
log_fn('Variables: %s' % self.params.variable_update)
if (self.params.variable_update == 'replicated' or
self.params.variable_update == 'distributed_all_reduce'
or self.params.variable_update == 'collective_all_reduce'):
log_fn('AllReduce: %s' % self.params.all_reduce_spec)
if self.job_name:
log_fn('Sync: %s' % self.params.cross_replica_sync)
if self.params.staged_vars:
log_fn('Staged vars: %s' % self.params.staged_vars)
if self.params.variable_update == 'horovod' and self.params.horovod_device:
log_fn('Horovod on: %s' % self.params.horovod_device)
log_fn('==========')
def _get_params_info(self):
"""Get the common parameters info for the benchmark run.
Returns:
A dict of processed parameters.
"""
dataset_name = self.dataset.name
if self.dataset.use_synthetic_gpu_inputs():
dataset_name += ' (synthetic)'
single_session = self.params.variable_update == 'distributed_all_reduce'
if single_session:
device_list = self.raw_devices_across_tasks()
elif self.params.variable_update == 'horovod':
device_list = ['horovod/%s:%d' % (self.params.device, idx)
for idx in range(self.num_workers)]
else:
device_list = self.raw_devices
return {
'dataset_name': dataset_name,
'single_session': single_session,
'device_list': device_list,
}
def _log_benchmark_run(self):
"""Log the benchmark info to the logger.
The info logged here should be similar to print_info(), but in a structured
JSON format.
"""
if self.benchmark_logger:
benchmark_info = self._get_params_info()
run_param = {
'model': self.model.get_model_name(),
'dataset': benchmark_info['dataset_name'],
'mode': self.mode,
'single_sess': benchmark_info['single_session'],
'devices': benchmark_info['device_list'],
'batch_size': self.batch_size,
'batch_size_per_device': self.batch_size // len(self.raw_devices),
'num_batches': self.num_batches,
'num_epochs': self.num_epochs,
'data_format': self.params.data_format,
'rewrite_config': self.rewriter_config,
'optimizer': self.params.optimizer,
'session_config': create_config_proto(self.params),
}
# TODO(scottzhu): tf_cnn_benchmark might execute several times with
# different param setting on the same box. This will cause the run file to
# only contain the latest info. The benchmark_log_dir should be updated
# for every new run.
self.benchmark_logger.log_run_info(
self.model.get_model_name(), benchmark_info['dataset_name'],
run_param, test_id=self.params.benchmark_test_id)
def run(self):
"""Run the benchmark task assigned to this process.
Returns:
Dictionary of statistics for training or eval.
Raises:
ValueError: unrecognized job name.
"""
if self.params.job_name == 'ps':
log_fn('Running parameter server %s' % self.task_index)
self.cluster_manager.join_server()
return {}
# For distributed_all_reduce with multiple workers, drive
# from a separate controller process.
if self.params.variable_update == 'distributed_all_reduce':
if self.params.job_name == 'worker':
log_fn('Starting worker %s' % self.task_index)
self.cluster_manager.join_server()
return
elif self.params.job_name and self.params.job_name != 'controller':
raise ValueError('unrecognized job name: %s' % self.params.job_name)
self._log_benchmark_run()
if self._doing_eval:
with tf.Graph().as_default():
# TODO(laigd): freeze the graph in eval mode.
return self._run_eval()
else:
return self._benchmark_train()
def _run_eval(self):
"""Evaluate a model every self.params.eval_interval_secs.
Returns:
Dictionary containing eval statistics. Currently returns an empty
dictionary.
Raises:
ValueError: If self.params.train_dir is unspecified.
"""
if self.params.train_dir is None:
raise ValueError('Trained model directory not specified')
graph_info = self._build_eval_graph()
saver = tf.train.Saver(self.variable_mgr.savable_variables())
summary_writer = tf.summary.FileWriter(self.params.eval_dir,
tf.get_default_graph())
target = ''
# TODO(huangyp): Check if checkpoints haven't updated for hours and abort.
while True:
with tf.Session(
target=target, config=create_config_proto(self.params)) as sess:
image_producer = None
try:
global_step = load_checkpoint(saver, sess, self.params.train_dir)
image_producer = self._initialize_eval_graph(
graph_info.enqueue_ops, graph_info.input_producer_op,
graph_info.local_var_init_op_group, sess)
except CheckpointNotFoundException:
log_fn('Checkpoint not found in %s' % self.params.train_dir)
else: # Only executes if an exception was not thrown
self._eval_once(sess, summary_writer, graph_info.fetches,
graph_info.summary_op, image_producer, global_step)
if image_producer is not None:
image_producer.done()
if self.params.eval_interval_secs <= 0:
break
time.sleep(self.params.eval_interval_secs)
return {}
def _build_eval_graph(self, scope_name=None):
"""Build the evaluation graph.
Args:
scope_name: String to filter what summaries are collected. Only summary
ops whose name contains `scope_name` will be added, which is useful for
only including evaluation ops.
Returns:
A GraphInfo named_tuple containing various useful ops and tensors of the
evaluation graph.
"""
with self._do_eval():
input_producer_op, enqueue_ops, fetches = self._build_model()
local_var_init_op = tf.local_variables_initializer()
table_init_ops = tf.tables_initializer()
variable_mgr_init_ops = [local_var_init_op]
if table_init_ops:
variable_mgr_init_ops.extend([table_init_ops])
with tf.control_dependencies([local_var_init_op]):
variable_mgr_init_ops.extend(self.variable_mgr.get_post_init_ops())
local_var_init_op_group = tf.group(*variable_mgr_init_ops)
summary_op = tf.summary.merge_all(scope=scope_name)
# The eval graph has no execution barrier because it doesn't run in
# distributed mode.
execution_barrier = None
# We do not use the global step during evaluation.
global_step = None
return GraphInfo(input_producer_op, enqueue_ops, fetches,
execution_barrier, global_step, local_var_init_op_group,
summary_op)
# TODO(reedwm): For consistency, we should have a similar
# "_initialize_train_graph" function. They can likely be the same function.
def _initialize_eval_graph(self, enqueue_ops, input_producer_op,
local_var_init_op_group, sess):
"""Initializes the evaluation graph.
Args:
enqueue_ops: Ops that add the preprocessed images to the staging areas.
input_producer_op: Op that produces the input batches (before
preprocessing).
local_var_init_op_group: Group of ops that perform per-device
initialization work.
sess: The session to initialize the eval graph with.
Returns:
An ImageProducer, or None if an ImageProducer isn't being used.
"""
with self._do_eval():
if local_var_init_op_group is not None:
# We might reinitialize local variables if they were already initialized
# during training. This is OK.
sess.run(local_var_init_op_group)
if self.dataset.queue_runner_required():
tf.train.start_queue_runners(sess=sess)
image_producer = None
if input_producer_op is not None:
image_producer = cnn_util.ImageProducer(
sess, input_producer_op, self.batch_group_size,
self.params.use_python32_barrier)
image_producer.start()
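# Warm up the staged input pipeline: running increasing prefixes of the
# enqueue ops primes each staging level with data before the first real step.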
if enqueue_ops:
for i in xrange(len(enqueue_ops)):
sess.run(enqueue_ops[:(i + 1)])
if image_producer is not None:
image_producer.notify_image_consumption()
return image_producer
def _eval_once(self, sess, summary_writer, fetches, summary_op,
image_producer, global_step):
"""Evaluate the model using the validation dataset."""
with self._do_eval():
mlperf.logger.log_eval_epoch(
mlperf.tags.EVAL_START, global_step, self.batch_size)
loop_start_time = start_time = time.perf_counter()
# TODO(laigd): refactor the part to compute/report the accuracy. Currently
# it only works for image models.
top_1_accuracy_sum = 0.0
top_5_accuracy_sum = 0.0
total_eval_count = self.num_batches * self.batch_size
for step in xrange(self.num_batches):
if (summary_writer and self.params.save_summaries_steps > 0 and
(step + 1) % self.params.save_summaries_steps == 0):
results, summary_str = sess.run([fetches, summary_op])
summary_writer.add_summary(summary_str)
else:
results = sess.run(fetches)
# Make global_step available in results for postprocessing.
results['global_step'] = global_step
results = self.model.postprocess(results)
top_1_accuracy_sum += results['top_1_accuracy']
top_5_accuracy_sum += results['top_5_accuracy']
if (step + 1) % self.params.display_every == 0:
duration = time.perf_counter() - start_time
examples_per_sec = (
self.batch_size * self.params.display_every / duration)
log_fn('%i\t%.1f examples/sec' % (step + 1, examples_per_sec))
start_time = time.perf_counter()
if image_producer is not None:
image_producer.notify_image_consumption()
loop_end_time = time.perf_counter()
accuracy_at_1 = top_1_accuracy_sum / self.num_batches
accuracy_at_5 = top_5_accuracy_sum / self.num_batches
summary = tf.Summary()
summary.value.add(tag='eval/Accuracy@1', simple_value=accuracy_at_1)
summary.value.add(tag='eval/Accuracy@5', simple_value=accuracy_at_5)
for result_key, result_value in results.items():
if result_key.startswith(constants.SIMPLE_VALUE_RESULT_PREFIX):
prefix_len = len(constants.SIMPLE_VALUE_RESULT_PREFIX)
summary.value.add(tag='eval/' + result_key[prefix_len:],
simple_value=result_value)
if summary_writer:
summary_writer.add_summary(summary, global_step)
log_fn('Accuracy @ 1 = %.4f Accuracy @ 5 = %.4f [%d examples]' %
(accuracy_at_1, accuracy_at_5, total_eval_count))
elapsed_time = loop_end_time - loop_start_time
images_per_sec = (self.num_batches * self.batch_size / elapsed_time)
if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
# Note that we compute the top 1 accuracy and top 5 accuracy for each
# batch, which will have a slight performance impact.
log_fn('-' * 64)
log_fn('total images/sec: %.2f' % images_per_sec)
log_fn('-' * 64)
if self.benchmark_logger:
eval_result = {
'eval_top_1_accuracy': accuracy_at_1,
'eval_top_5_accuracy': accuracy_at_5,
'eval_average_examples_per_sec': images_per_sec,
tf.GraphKeys.GLOBAL_STEP: global_step,
}
self.benchmark_logger.log_evaluation_result(eval_result)
mlperf.logger.log_eval_epoch(
mlperf.tags.EVAL_STOP, global_step, self.batch_size)
mlperf.logger.log(key=mlperf.tags.EVAL_SIZE,
value=self.num_batches * self.batch_size)
if self.params.model != 'ssd300': # ssd300 logs eval accuracy elsewhere.
mlperf.logger.log_eval_accuracy(
accuracy_at_1, global_step, self.train_batch_size,
examples_per_epoch=self.dataset.num_examples_per_epoch('train'))
if self.params.stop_at_top_1_accuracy:
mlperf.logger.log(key=mlperf.tags.EVAL_TARGET,
value=self.params.stop_at_top_1_accuracy)
return accuracy_at_1, accuracy_at_5
def _benchmark_train(self):
"""Run cnn in benchmark mode. Skip the backward pass if forward_only is on.
Returns:
Dictionary containing training statistics (num_workers, num_steps,
average_wall_time, images_per_sec).
"""
graph = tf.Graph()
with graph.as_default():
build_result = self._build_graph()
if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
with self.variable_mgr.reuse_variables():
with tf.name_scope('Evaluation') as ns:
eval_build_results = self._build_eval_graph(ns)
else:
eval_build_results = None
(graph, result_to_benchmark) = self._preprocess_graph(graph, build_result)
with graph.as_default():
return self._benchmark_graph(result_to_benchmark, eval_build_results)
GPU_CACHED_INPUT_VARIABLE_NAME = 'gpu_cached_inputs'
def _unfreezable_local_variables(self, graph):
"""Get the local variables that we don't want to freeze."""
return graph.get_collection(
tf.GraphKeys.LOCAL_VARIABLES,
# We don't freeze the gpu_cached_images local variable so it won't get
# constant folded with ops which process the input.
scope='.*' + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME)
def _build_graph(self):
"""Build the graph.
Returns:
A namedtuple containing the ops/tensors that are required by
_benchmark_graph().
"""
if self.single_session:
(input_producer_op, enqueue_ops, fetches) = (
self._build_model_single_session())
else:
(input_producer_op, enqueue_ops, fetches) = self._build_model()
fetches_list = nest.flatten(list(fetches.values()))
main_fetch_group = tf.group(*fetches_list, name='main_fetch_group')
execution_barrier = None
if (not self.single_session and self.job_name and
not self.params.cross_replica_sync):
execution_barrier = self.add_sync_queues_and_barrier(
'execution_barrier_', [])
global_step = tf.train.get_global_step()
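# Incrementing the global step is given a control dependency on the main
# fetches so it only happens after the step's work has completed.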
with tf.device(self.global_step_device), tf.name_scope('inc_global_step'):
with tf.control_dependencies([main_fetch_group]):
fetches['inc_global_step'] = global_step.assign_add(1)
if ((not self.single_session) and (not self.distributed_collective) and
self.job_name and self.params.cross_replica_sync):
# Block all replicas until all replicas are ready for next step.
fetches['sync_queues'] = self.add_sync_queues_and_barrier(
'sync_queues_step_end_', [main_fetch_group])
# Skips the init ops for freezable local variables in forward_only mode so
# we can remove all the assign ops when converting variables to constants.
with tf.name_scope('local_variable_initialization'):
if self.forward_only_and_freeze:
local_var_init_op = tf.variables_initializer(
self._unfreezable_local_variables(tf.get_default_graph()))
else:
local_var_init_op = tf.local_variables_initializer()
table_init_ops = tf.tables_initializer()
variable_manager_init_ops = [local_var_init_op]
if table_init_ops:
variable_manager_init_ops.extend([table_init_ops])
if not self.forward_only_and_freeze:
with tf.control_dependencies([local_var_init_op]):
variable_manager_init_ops.extend(self.variable_mgr.get_post_init_ops())
if ((not self.single_session) and (not self.distributed_collective) and
self.job_name and self.params.cross_replica_sync):
# Ensure all workers execute variable_manager_init_ops before they start
# executing the model.
variable_manager_init_ops.append(
self.add_sync_queues_and_barrier('init_ops_end_',
variable_manager_init_ops))
local_var_init_op_group = tf.group(*variable_manager_init_ops,
name='local_var_init_op_group')
summary_op = tf.summary.merge_all()
return GraphInfo(
input_producer_op=input_producer_op,
enqueue_ops=enqueue_ops,
fetches=fetches,
execution_barrier=execution_barrier,
global_step=global_step,
local_var_init_op_group=local_var_init_op_group,
summary_op=summary_op)
def _benchmark_graph(self, graph_info, eval_graph_info):
"""Benchmark the training graph.
Args:
graph_info: the namedtuple returned by _build_graph() which
contains all necessary information to benchmark the graph, including
named tensors/ops list, fetches, etc.
eval_graph_info: Similar to graph_info but for the eval graph if
--eval_during_training_* is used. Otherwise, None.
Returns:
Dictionary containing training statistics (num_workers, num_steps,
average_wall_time, images_per_sec).
"""
log_fn('Initializing graph')
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
# First worker will be 'chief' - it will write summaries and
# save checkpoints.
is_chief = hvd.rank() == 0
else:
is_chief = (not self.job_name or self.task_index == 0)
summary_writer = None
if (is_chief and self.params.summary_verbosity and self.params.train_dir and
self.params.save_summaries_steps > 0):
summary_writer = tf.summary.FileWriter(self.params.train_dir,
tf.get_default_graph())
# We want to start the benchmark timer right after an image_producer barrier
# to avoid undesired waiting times on barriers.
if ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
self.batch_group_size) != 0:
self.num_warmup_batches = int(
math.ceil(
(self.num_warmup_batches + len(graph_info.enqueue_ops) - 1.0) /
(self.batch_group_size)) * self.batch_group_size -
len(graph_info.enqueue_ops) + 1)
log_fn('Round up warm up steps to %d to match batch_group_size' %
self.num_warmup_batches)
assert ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
self.batch_group_size) == 0
# We run the summaries in the same thread as the training operations by
# passing in None for summary_op to avoid a summary_thread being started.
# Running summaries and training operations in parallel could run out of
# GPU memory.
if is_chief and not self.forward_only_and_freeze:
saver = tf.train.Saver(
self.variable_mgr.savable_variables(),
save_relative_paths=True,
max_to_keep=self.params.max_ckpts_to_keep)
else:
saver = None
ready_for_local_init_op = None
if self.job_name and not (self.single_session or
self.distributed_collective):
# In distributed mode, we don't want to run local_var_init_op_group until
# the global variables are initialized, because local_var_init_op_group
# may use global variables (such as in distributed replicated mode). We
# don't set this in non-distributed mode, because in non-distributed mode,
# local_var_init_op_group may itself initialize global variables (such as
# in replicated mode).
ready_for_local_init_op = tf.report_uninitialized_variables(
tf.global_variables())
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
bcast_global_variables_op = hvd.broadcast_global_variables(0)
else:
bcast_global_variables_op = None
if self.params.variable_update == 'collective_all_reduce':
# It doesn't matter what this collective_graph_key value is,
# so long as it's > 0 and the same at every worker.
init_run_options = tf.RunOptions()
init_run_options.experimental.collective_graph_key = 6
else:
init_run_options = tf.RunOptions()
local_var_init_ops = [graph_info.local_var_init_op_group]
if eval_graph_info:
# `eval_graph_info.local_var_init_op_group` also includes some of the
# training initializer ops, since it's difficult to filter them out.
# Rerunning the training initializer ops is OK, but we add a control
# dependency since running two sets of training initializer ops at the
# same time can cause race conditions.
with tf.control_dependencies(local_var_init_ops):
local_var_init_ops.append(eval_graph_info.local_var_init_op_group)
sv = tf.train.Supervisor(
# For the purpose of Supervisor, all Horovod workers are 'chiefs',
# since we want session to be initialized symmetrically on all the
# workers.
is_chief=is_chief or (self.params.variable_update == 'horovod'
or self.distributed_collective),
# Log dir should be unset on non-chief workers to prevent Horovod
# workers from corrupting each other's checkpoints.
logdir=self.params.train_dir if is_chief else None,
ready_for_local_init_op=ready_for_local_init_op,
local_init_op=local_var_init_ops,
saver=saver,
global_step=graph_info.global_step,
summary_op=None,
save_model_secs=self.params.save_model_secs,
summary_writer=summary_writer,
local_init_run_options=init_run_options)
profiler = tf.profiler.Profiler() if self.params.tfprof_file else None
if self.graph_file is not None:
path, filename = os.path.split(self.graph_file)
as_text = filename.endswith('txt')
log_fn('Writing GraphDef as %s to %s' % ( # pyformat break
'text' if as_text else 'binary', self.graph_file))
tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
path, filename, as_text)
start_standard_services = (
self.params.train_dir or
self.dataset.queue_runner_required())
target = self.cluster_manager.get_target() if self.cluster_manager else ''
with sv.managed_session(
master=target,
config=create_config_proto(self.params),
start_standard_services=start_standard_services) as sess:
# Anything that can potentially raise an OutOfRangeError with 'sess' MUST
# be under this try block. The managed_session() context manager silently
# ignores OutOfRangeError, so we must catch them and wrap them with
# a different exception type so that they can be propagated up to the
# caller.
try:
stats = self.benchmark_with_session(
sess, sv, graph_info, eval_graph_info, bcast_global_variables_op,
is_chief, summary_writer, profiler)
except tf.errors.OutOfRangeError:
raise RuntimeError(
'Received OutOfRangeError. Wrapping in Runtime error to avoid '
'Supervisor from suppressing the error. Original OutOfRangeError '
'with traceback:\n' + traceback.format_exc())
sv.stop()
if profiler:
generate_tfprof_profile(profiler, self.params.tfprof_file)
return stats
def benchmark_with_session(self, sess, supervisor, graph_info,
eval_graph_info, bcast_global_variables_op,
is_chief, summary_writer, profiler):
"""Benchmarks the graph with the given session.
Args:
sess: The session to benchmark the graph with
supervisor: The Supervisor that created the session.
graph_info: the namedtuple returned by _build_graph() which
contains all necessary information to benchmark the graph, including
named tensors/ops list, fetches, etc.
eval_graph_info: Similar to graph_info but for the eval graph if
--eval_during_training_* is used. Otherwise, None.
bcast_global_variables_op: If Horovod is used, the op to broadcast the
global variables to all the processes. None if Horovod is not used.
is_chief: True if this is the chief process.
summary_writer: The SummaryWriter used to write summaries, or None if
summaries are not used.
profiler: The tf.profiler.Profiler, or None if tfprof is not used.
Returns:
Dictionary containing training statistics (num_workers, num_steps,
average_wall_time, images_per_sec).
"""
if self.params.backbone_model_path is not None:
self.model.load_backbone_model(sess, self.params.backbone_model_path)
if bcast_global_variables_op:
sess.run(bcast_global_variables_op)
image_producer = None
if graph_info.input_producer_op is not None:
image_producer = cnn_util.ImageProducer(
sess, graph_info.input_producer_op, self.batch_group_size,
self.params.use_python32_barrier)
image_producer.start()
if graph_info.enqueue_ops:
for i in xrange(len(graph_info.enqueue_ops)):
sess.run(graph_info.enqueue_ops[:(i + 1)])
if image_producer is not None:
image_producer.notify_image_consumption()
self.init_global_step, = sess.run([graph_info.global_step])
if self.job_name and not self.params.cross_replica_sync:
# TODO(zhengxq): Do we need to use a global step watcher at all?
global_step_watcher = GlobalStepWatcher(
sess, graph_info.global_step,
self.num_workers * self.num_warmup_batches +
self.init_global_step,
self.num_workers * (self.num_warmup_batches + self.num_batches) - 1)
global_step_watcher.start()
else:
global_step_watcher = None
eval_image_producer = None
if eval_graph_info:
# We pass local_var_init_op_group=None because the Supervisor already
# initialized local variables above. We need to have the Supervisor
# initialize the local variables, because otherwise it throws an error
# complaining that not all variables were initialized.
eval_image_producer = self._initialize_eval_graph(
eval_graph_info.enqueue_ops, eval_graph_info.input_producer_op,
local_var_init_op_group=None, sess=sess)
step_train_times = []
log_fn('Running warm up')
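# local_step counts up from -num_warmup_batches, so step 0 is the first step
# that is actually timed.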
local_step = -1 * self.num_warmup_batches
if self.single_session:
# In single session mode, each step, the global_step is incremented by
# 1. In non-single session mode, each step, the global_step is
# incremented once per worker. This means we need to divide
# init_global_step by num_workers only in non-single session mode.
end_local_step = self.num_batches - self.init_global_step
else:
end_local_step = self.num_batches - (self.init_global_step //
self.num_workers)
if not global_step_watcher:
# In cross-replica sync mode, all workers must run the same number of
# local steps, or else the workers running the extra step will block.
done_fn = lambda: local_step >= end_local_step
else:
done_fn = global_step_watcher.done
if self.params.debugger is not None:
if self.params.debugger == 'cli':
log_fn('The CLI TensorFlow debugger will be used.')
sess = tf_debug.LocalCLIDebugWrapperSession(sess)
else:
log_fn('The TensorBoard debugger plugin will be used.')
sess = tf_debug.TensorBoardDebugWrapperSession(sess,
self.params.debugger)
mlperf.logger.log(key=mlperf.tags.TRAIN_LOOP)
skip_final_eval = False
accuracy_at_1 = None
accuracy_at_5 = None
last_eval_step = local_step
loop_start_time = time.perf_counter()
last_average_loss = None
while not done_fn():
if local_step == 0:
log_fn('Done warm up')
if graph_info.execution_barrier:
log_fn('Waiting for other replicas to finish warm up')
sess.run([graph_info.execution_barrier])
# TODO(laigd): rename 'Img' to maybe 'Input'.
header_str = ('Step\tImg/sec\t' +
self.params.loss_type_to_report.replace('/', ' '))
if self.params.print_training_accuracy or self.params.forward_only:
# TODO(laigd): use the actual accuracy op names of the model.
header_str += '\ttop_1_accuracy\ttop_5_accuracy'
log_fn(header_str)
assert len(step_train_times) == self.num_warmup_batches
# Reset step times so the warm-up batches are ignored.
step_train_times = []
loop_start_time = time.perf_counter()
if (summary_writer and
(local_step + 1) % self.params.save_summaries_steps == 0):
fetch_summary = graph_info.summary_op
else:
fetch_summary = None
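# Training steps under collective_all_reduce use their own
# collective_graph_key; as with the init key above, the exact value only
# needs to be consistent across workers.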
collective_graph_key = 7 if (
self.params.variable_update == 'collective_all_reduce') else 0
(summary_str, last_average_loss) = benchmark_one_step(
sess, graph_info.fetches, local_step,
self.batch_size * (self.num_workers
if self.single_session else 1), step_train_times,
self.trace_filename, self.params.partitioned_graph_file_prefix,
profiler, image_producer, self.params, fetch_summary,
benchmark_logger=self.benchmark_logger,
collective_graph_key=collective_graph_key,
should_output_files=(self.params.variable_update != 'horovod' or
is_chief))
if summary_str is not None and is_chief:
supervisor.summary_computed(sess, summary_str)
local_step += 1
if (self.params.save_model_steps and
local_step % self.params.save_model_steps == 0 and
local_step > 0 and
is_chief):
supervisor.saver.save(sess, supervisor.save_path,
supervisor.global_step)
if (eval_graph_info and local_step > 0 and not done_fn() and
self._should_eval_during_training(local_step)):
python_global_step = sess.run(graph_info.global_step)
num_steps_since_last_eval = local_step - last_eval_step
# The INPUT_SIZE tag value might not match the
# PREPROC_NUM_TRAIN_EXAMPLES tag value, because the number of examples
# run, which is INPUT_SIZE, is rounded up to the nearest multiple of
# self.batch_size.
mlperf.logger.log(
key=mlperf.tags.INPUT_SIZE,
value=num_steps_since_last_eval * self.batch_size)
log_fn('Running evaluation at global_step {}'.format(
python_global_step))
accuracy_at_1, accuracy_at_5 = self._eval_once(
sess, summary_writer, eval_graph_info.fetches,
eval_graph_info.summary_op, eval_image_producer,
python_global_step)
last_eval_step = local_step
if (self.params.stop_at_top_1_accuracy and
accuracy_at_1 >= self.params.stop_at_top_1_accuracy):
log_fn('Stopping, as eval accuracy at least %s was reached' %
self.params.stop_at_top_1_accuracy)
skip_final_eval = True
break
else:
log_fn('Resuming training')
if eval_graph_info and self.model.reached_target():
log_fn('Stopping, as the model indicates its custom goal was reached')
skip_final_eval = True
break
loop_end_time = time.perf_counter()
# Waits for the global step to be done, regardless of done_fn.
if global_step_watcher:
while not global_step_watcher.done():
time.sleep(.25)
if not global_step_watcher:
elapsed_time = loop_end_time - loop_start_time
average_wall_time = elapsed_time / local_step if local_step > 0 else 0
images_per_sec = (self.num_workers * local_step * self.batch_size /
elapsed_time)
num_steps = local_step * self.num_workers
else:
# NOTE: Each worker independently increases the global step. So,
# num_steps will be the sum of the local_steps from each worker.
num_steps = global_step_watcher.num_steps()
elapsed_time = global_step_watcher.elapsed_time()
average_wall_time = (elapsed_time * self.num_workers / num_steps
if num_steps > 0 else 0)
images_per_sec = num_steps * self.batch_size / elapsed_time
# We skip printing images/sec if --eval_during_training_* is specified,
# because we are both processing training and evaluation images, so a
# singular "images/sec" value is meaningless.
if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
log_fn('-' * 64)
# TODO(laigd): rename 'images' to maybe 'inputs'.
log_fn('total images/sec: %.2f' % images_per_sec)
log_fn('-' * 64)
else:
log_fn('Done with training')
num_steps_since_last_eval = local_step - last_eval_step
mlperf.logger.log(
key=mlperf.tags.INPUT_SIZE,
value=num_steps_since_last_eval * self.batch_size)
python_global_step = sess.run(graph_info.global_step)
if eval_graph_info and not skip_final_eval:
log_fn('Running final evaluation at global_step {}'.format(
python_global_step))
accuracy_at_1, accuracy_at_5 = self._eval_once(
sess, summary_writer, eval_graph_info.fetches,
eval_graph_info.summary_op, eval_image_producer, python_global_step)
num_epochs_ran = (python_global_step * self.batch_size /
self.dataset.num_examples_per_epoch('train'))
mlperf.logger.log_train_epochs(num_epochs_ran)
if image_producer is not None:
image_producer.done()
if eval_image_producer is not None:
eval_image_producer.done()
if is_chief:
if self.benchmark_logger:
self.benchmark_logger.log_metric(
'average_examples_per_sec', images_per_sec, global_step=num_steps)
# Save the model checkpoint.
if self.params.train_dir is not None and is_chief:
checkpoint_path = os.path.join(self.params.train_dir, 'model.ckpt')
if not gfile.Exists(self.params.train_dir):
gfile.MakeDirs(self.params.train_dir)
supervisor.saver.save(sess, checkpoint_path, graph_info.global_step)
if graph_info.execution_barrier:
# Wait for other workers to reach the end, so this worker doesn't
# go away underneath them.
sess.run([graph_info.execution_barrier])
stats = {
'num_workers': self.num_workers,
'num_steps': num_steps,
'average_wall_time': average_wall_time,
'images_per_sec': images_per_sec
}
if last_average_loss is not None:
stats['last_average_loss'] = last_average_loss
if accuracy_at_1 is not None:
stats['top_1_accuracy'] = accuracy_at_1
if accuracy_at_5 is not None:
stats['top_5_accuracy'] = accuracy_at_5
success = bool(self.model.reached_target() or
(accuracy_at_1 and self.params.stop_at_top_1_accuracy and
accuracy_at_1 >= self.params.stop_at_top_1_accuracy))
mlperf.logger.log(key=mlperf.tags.RUN_STOP, value={'success': success})
mlperf.logger.log(key=mlperf.tags.RUN_FINAL)
return stats
def _should_eval_during_training(self, step):
"""Return True iff should run eval during training at current step."""
assert self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL
if self.params.eval_during_training_every_n_steps:
return step % self.params.eval_during_training_every_n_steps == 0
# All other --eval_during_training_* flags are converted to step numbers
# at which the model should run evaluation during training.
return step in self.eval_during_training_at_specified_steps
def _preprocess_graph(self, graph, graph_info):
"""Preprocess the graph before executing.
Depending on the params, it runs various preprocessing on the graph,
including freezing, TensorRT conversion, etc.
Args:
graph: the graph to preprocess.
graph_info: the namedtuple returned by _build_graph() which
contains all necessary information to benchmark the graph, including
named tensors/ops list, fetches, etc.
Returns:
The updated graph and graph_info with the ops/tensors/fetches updated
according to the imported graph.
"""
assert isinstance(graph_info.fetches, dict)
assert isinstance(graph_info.global_step, tf.Variable)
if not self.forward_only_and_freeze:
return (graph, graph_info)
# Get the names of the ops that we need to keep during conversion.
flattened_op_names = list(
set([
v.name.split(':')[0]
for v in nest.flatten(graph_info)
if v is not None
]))
# Get variables that we don't want to freeze.
# Only keep unfreezable variables in forward_only_and_freeze mode.
# TODO(laigd): consider making global_step a constant.
variables_to_keep = {graph_info.global_step: tf.GraphKeys.GLOBAL_VARIABLES}
variables_to_keep.update({
local_variable: tf.GraphKeys.LOCAL_VARIABLES
for local_variable in self._unfreezable_local_variables(graph)
})
variable_initializers = [
variable.initializer.name for variable in variables_to_keep]
output_node_names = (
flattened_op_names +
# Add variable initializer and read ops to the output list, so
# convert_variables_to_constants() will keep them.
variable_initializers +
[variable.value().op.name for variable in variables_to_keep])
graphdef = graph.as_graph_def(add_shapes=True)
# Freeze the graph.
with graph.as_default():
with tf.Session(config=create_config_proto(self.params)) as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
graphdef = graph_util.convert_variables_to_constants(
sess,
graphdef,
output_node_names,
variable_names_blacklist=[
variable.op.name for variable in variables_to_keep
])
# Run TensorRT conversion.
if self.params.trt_mode:
# Import here instead of at top, because this will crash if TensorRT is
# not installed.
from tensorflow.python.compiler.tensorrt import trt_convert # pylint: disable=g-import-not-at-top
# Prevent the TF-TRT bridge from touching the variable initializer ops and
# their dependencies, since they can be fetched directly by the sess.run()
# calls that initialize the variables.
# pylint: disable=protected-access
name_to_input_name, _, _ = graph_util_impl._extract_graph_summary(
graphdef)
initializer_subgraph_ops = graph_util_impl._bfs_for_reachable_nodes(
variable_initializers, name_to_input_name)
# pylint: enable=protected-access
graphdef = trt_convert.create_inference_graph(
graphdef,
outputs=output_node_names + list(initializer_subgraph_ops),
max_batch_size=self.model.get_batch_size(),
max_workspace_size_bytes=self.params.trt_max_workspace_size_bytes,
precision_mode=self.params.trt_mode)
# Create a new graph as the default and import the converted graph into it.
updated_graph = tf.Graph()
def _get_tensors_or_ops(inputs):
"""Gets the updated tensors or ops from 'updated_graph'."""
def _get_fn(element):
if element is None:
return None
if ':' in element.name:
return updated_graph.get_tensor_by_name(element.name)
return updated_graph.get_operation_by_name(element.name)
if isinstance(inputs, (list, dict, tuple)):
return nest.map_structure(_get_fn, inputs)
else:
return _get_fn(inputs)
with updated_graph.as_default():
importer.import_graph_def(graph_def=graphdef, name='')
# Update the variables
for variable in variables_to_keep:
updated_variable = tf.Variable.from_proto(variable.to_proto())
tf.add_to_collection(variables_to_keep[variable], updated_variable)
if variable is graph_info.global_step:
updated_global_step = updated_variable
updated_graph_info = GraphInfo(
input_producer_op=_get_tensors_or_ops(graph_info.input_producer_op),
enqueue_ops=_get_tensors_or_ops(graph_info.enqueue_ops),
execution_barrier=_get_tensors_or_ops(graph_info.execution_barrier),
local_var_init_op_group=_get_tensors_or_ops(
graph_info.local_var_init_op_group),
fetches=_get_tensors_or_ops(graph_info.fetches),
global_step=updated_global_step,
summary_op=None)
return (updated_graph, updated_graph_info)
def _build_input_processing(self, shift_ratio=0):
""""Build the image (pre)processing portion of the model graph.
Args:
shift_ratio: shift_ratio for data_flow_ops.RecordInput.
Returns:
An InputProcessingInfo containing all the input sources to the model.
"""
input_processing_info = InputProcessingInfo(
input_producer_op=None,
input_producer_stages=None,
multi_device_iterator_input=None)
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
if not self._doing_eval:
mlperf.logger.log(key=mlperf.tags.INPUT_BATCH_SIZE, value=self.batch_size)
# If using synthetic gpu inputs, do nothing on the cpu side.
if self.dataset.use_synthetic_gpu_inputs():
assert not self.datasets_use_prefetch
return input_processing_info
if self._doing_eval:
input_preprocessor = self.eval_input_preprocessor
mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES,
value=self.dataset.num_examples_per_epoch('validation'))
else:
input_preprocessor = self.input_preprocessor
mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES,
value=self.dataset.num_examples_per_epoch('train'))
# Use prefetching mechanism provided by dataset input pipeline.
if self.datasets_use_prefetch:
multi_device_iterator = (
input_preprocessor.build_multi_device_iterator(
self.batch_size, len(self.devices), self.cpu_device, self.params,
self.raw_devices, self.dataset, self._doing_eval))
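      # get_next() below yields one element per device; the result is later
      # indexed by rel_device_num in add_forward_pass_and_gradients.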
return input_processing_info._replace(
multi_device_iterator_input=multi_device_iterator.get_next())
# Not using dataset prefetching. Use a staging area to mimic the prefetching
# behavior instead.
with tf.device(self.cpu_device):
if self._doing_eval:
subset = 'validation'
else:
subset = 'train'
input_list = input_preprocessor.minibatch(
self.dataset,
subset=subset,
params=self.params,
shift_ratio=shift_ratio)
input_producer_op = []
input_producer_stages = []
for device_num in range(len(self.devices)):
staging_area = data_flow_ops.StagingArea(
[parts[0].dtype for parts in input_list],
shapes=[parts[0].get_shape() for parts in input_list],
shared_name='input_producer_staging_area_%d_eval_%s' %
(device_num, self._doing_eval))
input_producer_stages.append(staging_area)
for group_index in xrange(self.batch_group_size):
batch_index = group_index + device_num * self.batch_group_size
put_op = staging_area.put(
[parts[batch_index] for parts in input_list])
input_producer_op.append(put_op)
assert input_producer_op
return input_processing_info._replace(
input_producer_op=input_producer_op,
input_producer_stages=input_producer_stages)
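  # A minimal, standalone sketch (not part of the benchmark) of the StagingArea
  # put/get pattern used above, assuming a single float batch and an int label:
  #
  #   stage = data_flow_ops.StagingArea([tf.float32, tf.int32],
  #                                     shapes=[[2], []])
  #   put_op = stage.put([tf.constant([1.0, 2.0]), tf.constant(7)])
  #   images, labels = stage.get()
  #   with tf.Session() as s:
  #     s.run(put_op)                   # producer side: stage one element
  #     print(s.run([images, labels]))  # consumer side gets the staged element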
def _maybe_initialize_fp16(self):
"""Initialize fp16 settings."""
if self.params.use_fp16 and not self._doing_eval:
init_loss_scale_val = float(self.params.fp16_loss_scale or
self.model.get_fp16_loss_scale())
self.loss_scale = None
self.loss_scale_normal_steps = None
if self.enable_auto_loss_scale or init_loss_scale_val != 1:
self.loss_scale = tf.get_variable(
name='loss_scale',
initializer=init_loss_scale_val,
dtype=tf.float32,
trainable=False)
if self.enable_auto_loss_scale:
self.loss_scale_normal_steps = tf.get_variable(
name='loss_scale_normal_steps', initializer=0, trainable=False)
def _build_model(self):
"""Build the TensorFlow graph."""
if self.datasets_use_prefetch:
assert not self.params.staged_vars
assert not self.variable_mgr.supports_staged_vars()
    # Adjust seed so different workers start reading different input files.
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
seed_adjustment = hvd.rank()
else:
seed_adjustment = 0
mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
value=self.params.tf_random_seed + seed_adjustment)
tf.set_random_seed(self.params.tf_random_seed + seed_adjustment)
mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
value=4321 + seed_adjustment)
np.random.seed(4321 + seed_adjustment)
phase_train = not (self._doing_eval or self.params.forward_only)
if self._doing_eval:
mode_string = 'evaluation'
else:
mode_string = 'training'
log_fn('Generating {} model'.format(mode_string))
losses = []
device_grads = []
all_logits = []
all_accuracy_ops = {}
gpu_compute_stage_ops = []
gpu_grad_stage_ops = []
with tf.device(self.global_step_device):
global_step = tf.train.get_or_create_global_step()
self._maybe_initialize_fp16()
# Build the processing and model for the worker.
input_producer_op = None
with tf.name_scope('input_processing'):
input_processing_info = self._build_input_processing(shift_ratio=0)
if input_processing_info.input_producer_op is not None:
input_producer_op = tf.group(*input_processing_info.input_producer_op)
update_ops = None
staging_delta_ops = []
for device_num in range(len(self.devices)):
with tf.name_scope('tower_%i' % device_num) as name_scope, (
self.variable_mgr.create_outer_variable_scope(device_num)):
results = self.add_forward_pass_and_gradients(
phase_train, device_num, device_num, input_processing_info,
gpu_compute_stage_ops, gpu_grad_stage_ops)
if self.params.backbone_model_path:
self.model.add_backbone_saver()
if phase_train:
losses.append(results['loss'])
device_grads.append(results['gradvars'])
else:
all_logits.append(results['logits'])
if not phase_train or self.params.print_training_accuracy:
for name, op in results.items():
if name.startswith('accuracy:'):
key = name[9:]
if key not in all_accuracy_ops:
all_accuracy_ops[key] = []
all_accuracy_ops[key].append(op)
if device_num == 0:
# Retain the Batch Normalization updates operations only from the
# first tower. These operations update the moving mean and moving
# variance variables, which are updated (but not used) during
# training, and used during evaluation. The moving mean and variance
# approximate the true mean and variance across all images in the
# dataset. Therefore, in replicated mode, these moving averages would
# be almost identical for each tower, and so we only update and save
# the moving averages for one tower. In parameter server mode, all
# towers share a copy of the variables so we also only need to update
# and save the moving averages once.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
if self.datasets_use_prefetch:
assert not self.variable_mgr.staging_delta_ops
else:
staging_delta_ops = list(self.variable_mgr.staging_delta_ops)
enqueue_ops = []
if not self.datasets_use_prefetch:
if self.variable_mgr.supports_staged_vars():
for staging_ops in self.variable_mgr.staging_vars_on_devices:
gpu_compute_stage_ops.extend(
[put_op for _, (put_op, _) in six.iteritems(staging_ops)])
enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
name='gpu_compute_stage_ops_group'))
if gpu_grad_stage_ops:
staging_delta_ops += gpu_grad_stage_ops
if staging_delta_ops:
enqueue_ops.append(tf.group(*(staging_delta_ops)))
if (self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL and
self.params.variable_update == 'replicated'):
# We need to get all the update ops instead of only those for the first
# tower. This is because during evaluation, each tower will read from its
# own tower's moving averages instead of the first tower's moving
# averages.
# TODO(reedwm): Have each tower read from the first tower's moving
# averages for a slight performance gain.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
mlperf.logger.log(key=mlperf.tags.INPUT_BN_SPAN,
value=self.batch_size // len(self.raw_devices))
fetches = self._build_fetches(global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops,
phase_train)
return (input_producer_op, enqueue_ops, fetches)
def _build_fetches(self, global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops, phase_train):
"""Complete construction of model graph, populating the fetches map."""
fetches = {}
if enqueue_ops:
fetches['enqueue_ops'] = enqueue_ops
for name, ops in all_accuracy_ops.items():
      # For fetches that start with 'tensor:', keep their dimensions and skip
      # reducing them to scalars.
if name.startswith(constants.UNREDUCED_ACCURACY_OP_PREFIX):
key = name[len(constants.UNREDUCED_ACCURACY_OP_PREFIX):]
fetches[key] = tf.concat(ops, 0)
else:
fetches[name] = (
tf.reduce_sum(ops) /
(self.batch_size *
(self.num_workers if self.single_session else 1)))
if self.task_index == 0 and self.params.summary_verbosity >= 1:
tf.summary.scalar(name, fetches[name])
if not phase_train:
if self.params.forward_only:
fetches['all_logits'] = tf.concat(all_logits, 0)
return fetches
apply_gradient_devices, gradient_state = (
self.variable_mgr.preprocess_device_grads(device_grads))
# TODO(reedwm): Greatly simplify the learning rate code.
if (self.params.variable_update == 'horovod' or
self.params.variable_update == 'collective_all_reduce'):
# Each worker independently increments global_step.
examples_per_step = self.batch_size * self.num_workers
else:
# global_step is shared by all workers, and so every iteration
# global_step is incremented by num_workers.
examples_per_step = self.batch_size
if self.params.compute_lr_on_cpu:
with tf.device(self.cpu_device):
learning_rate = get_learning_rate(self.params, global_step,
self.dataset.num_examples_per_epoch(),
self.model, examples_per_step)
training_ops = []
for d, device in enumerate(apply_gradient_devices):
with tf.device(device):
with tf.name_scope('average_loss'):
average_loss = tf.reduce_mean(losses)
with tf.name_scope('get_gradients_to_apply'):
avg_grads = self.variable_mgr.get_gradients_to_apply(d,
gradient_state)
if not self.params.compute_lr_on_cpu:
# We compute the learning rate once for each device in
# `apply_gradient_devices`.
learning_rate = get_learning_rate(
self.params, global_step, self.dataset.num_examples_per_epoch(),
self.model, examples_per_step)
gradient_clip = self.params.gradient_clip
if gradient_clip is not None:
with tf.name_scope('clip_gradients'):
clipped_grads = [(tf.clip_by_value(grad, -gradient_clip,
+gradient_clip), var)
for grad, var in avg_grads]
else:
clipped_grads = avg_grads
learning_rate = tf.identity(learning_rate, name='learning_rate_tensor')
opt = get_optimizer(self.params, learning_rate)
loss_scale_params = variable_mgr_util.AutoLossScaleParams(
enable_auto_loss_scale=self.enable_auto_loss_scale,
loss_scale=self.loss_scale,
loss_scale_normal_steps=self.loss_scale_normal_steps,
inc_loss_scale_every_n=self.params.fp16_inc_loss_scale_every_n,
is_chief=not self.job_name or self.task_index == 0)
with tf.name_scope('append_apply_gradient_ops'):
self.variable_mgr.append_apply_gradients_ops(
gradient_state, opt, clipped_grads, training_ops,
loss_scale_params)
train_op = tf.group(*(training_ops + update_ops), name='train_ops_group')
with tf.device(self.cpu_device):
if self.task_index == 0 and self.params.summary_verbosity >= 1:
tf.summary.scalar('learning_rate', learning_rate)
tf.summary.scalar(self.params.loss_type_to_report, average_loss)
if self.loss_scale is not None:
tf.summary.scalar('loss_scale', self.loss_scale)
if self.loss_scale_normal_steps:
tf.summary.scalar('loss_scale_normal_steps',
self.loss_scale_normal_steps)
if self.params.summary_verbosity >= 2:
self.gradient_histogram_summary(avg_grads)
if self.params.summary_verbosity >= 3:
for grad, var in avg_grads:
if grad is not None:
tf.summary.histogram(var.op.name + '/gradients', grad)
for var in tf.trainable_variables():
tf.summary.histogram(var.op.name, var)
fetches['train_op'] = train_op
fetches['average_loss'] = average_loss
return fetches
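  # The fetches dict built above contains 'enqueue_ops' (if any) and one entry
  # per accuracy op; in forward-only mode it also contains 'all_logits', and in
  # training mode it contains 'train_op' and 'average_loss'.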
def gradient_histogram_summary(self, avg_grads):
"""Create histogram of log values of all non-zero gradients."""
with tf.name_scope('log_gradients_summary'):
all_grads = []
for grad, _ in avg_grads:
all_grads.append(tf.reshape(grad, [-1]))
grads = tf.abs(tf.concat(all_grads, 0))
# exclude grads with zero values.
indices_for_non_zero_grads = tf.where(tf.not_equal(grads, 0))
log_grads = tf.reshape(
tf.log(tf.gather(grads, indices_for_non_zero_grads)), [-1])
tf.summary.histogram('log_gradients', log_grads)
def _build_model_single_session(self):
"""Build the TensorFlow graph for multiple replicas in a single_session.
Returns:
input_producer_op:
enqueue_ops:
fetches:
Raises:
ValueError: optimizer not recognized.
Single session runs multiple model replicas as part of one large
distributed graph, whose global execution is always step-synchronized.
"""
# verify assumptions
assert self.params.task_index == 0
assert not self._doing_eval
assert not self.params.forward_only
assert not self.params.staged_vars
tf.set_random_seed(self.params.tf_random_seed)
np.random.seed(4321)
phase_train = True
log_fn('Generating training model')
losses = []
device_grads = []
all_logits = []
all_accuracy_ops = {}
gpu_compute_stage_ops = []
gpu_grad_stage_ops = []
with tf.device(self.global_step_device):
global_step = tf.train.get_or_create_global_step()
update_ops = []
global_input_producer_op = []
is_local = not self.job_name
if is_local:
assert self.num_workers == 1
for task_num in range(self.num_workers):
# Reset the devices that self.variable_mgr knows about to those
# belonging to the next worker (task).
self.reset_devices_for_task(task_num, is_local)
# Build the per-worker image processing
with tf.name_scope('input_processing'):
input_processing_info = self._build_input_processing(
shift_ratio=(task_num / self.num_workers))
if input_processing_info.input_producer_op is not None:
global_input_producer_op.extend(input_processing_info.input_producer_op)
# Build the per-worker model replica.
for rel_device_num in range(len(self.devices)):
abs_device_num = task_num * len(self.devices) + rel_device_num
with self.variable_mgr.create_outer_variable_scope(
abs_device_num), tf.name_scope(
'task_%i_tower_%i' % (task_num, rel_device_num)) as name_scope:
task_results = self.add_forward_pass_and_gradients(
phase_train, rel_device_num, abs_device_num,
input_processing_info, gpu_compute_stage_ops, gpu_grad_stage_ops)
if self.params.backbone_model_path:
self.model.add_backbone_saver()
if phase_train:
losses.append(task_results['loss'])
device_grads.append(task_results['gradvars'])
else:
all_logits.append(task_results['logits'])
if not phase_train or self.params.print_training_accuracy:
for name, op in task_results.items():
if name.startswith('accuracy:'):
key = name[9:]
if key not in all_accuracy_ops:
all_accuracy_ops[key] = []
all_accuracy_ops[key].append(op)
if rel_device_num == 0:
# Retain the Batch Normalization updates operations only
# from the first tower. These operations update the moving
# mean and moving variance variables, which are updated
# (but not used) during training, and used during
# evaluation. The moving mean and variance approximate the
# true mean and variance across all images in the
# dataset. Therefore, in replicated mode, these moving
# averages would be almost identical for each tower, and
# so we only update and save the moving averages for one
# tower. In parameter server mode, all towers share a copy
# of the variables so we also only need to update and save
# the moving averages once.
update_ops.extend(
tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope))
assert not self.variable_mgr.staging_delta_ops
enqueue_ops = []
if gpu_compute_stage_ops:
enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
name='gpu_compute_stage_ops'))
assert not self.variable_mgr.supports_staged_vars()
assert not gpu_grad_stage_ops
fetches = self._build_fetches(global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops,
phase_train)
if global_input_producer_op:
global_input_producer_op = tf.group(*global_input_producer_op)
else:
global_input_producer_op = None
return (global_input_producer_op, enqueue_ops, fetches)
def add_forward_pass_and_gradients(self,
phase_train,
rel_device_num,
abs_device_num,
input_processing_info,
gpu_compute_stage_ops,
gpu_grad_stage_ops):
"""Add ops for forward-pass and gradient computations."""
nclass = self.dataset.num_classes
if self.datasets_use_prefetch:
assert input_processing_info.multi_device_iterator_input, (
'multi_device_iterator_input cannot be None if '
'datasets_use_prefetch=True')
input_list = (
input_processing_info.multi_device_iterator_input[rel_device_num])
else:
if not self.dataset.use_synthetic_gpu_inputs():
input_producer_stage = input_processing_info.input_producer_stages[
rel_device_num]
with tf.device(self.cpu_device):
host_input_list = input_producer_stage.get()
with tf.device(self.raw_devices[rel_device_num]):
gpu_compute_stage = data_flow_ops.StagingArea(
[inp.dtype for inp in host_input_list],
shapes=[inp.get_shape() for inp in host_input_list])
# The CPU-to-GPU copy is triggered here.
gpu_compute_stage_op = gpu_compute_stage.put(host_input_list)
input_list = gpu_compute_stage.get()
gpu_compute_stage_ops.append(gpu_compute_stage_op)
else:
with tf.device(self.raw_devices[rel_device_num]):
# Minor hack to avoid H2D copy when using synthetic data
input_list = self.model.get_synthetic_inputs(
BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME, nclass)
# Labels reshaping happens all on gpu:0. Reshaping synthetic labels on
# multiple devices slows down XLA computation for an unknown reason.
# TODO(b/116875203): Find/address root cause of XLA slow down.
labels_device_placement_hack = (
self.dataset.use_synthetic_gpu_inputs() and self.params.xla_compile)
def device_aware_reshape(tensor, shape):
device = self.devices[rel_device_num]
      # Labels are int32; place their reshapes on gpu:0 (no device placement)
      # when the hack is enabled.
if labels_device_placement_hack and tensor.dtype == tf.int32:
device = ''
with tf.device(device):
return tf.reshape(tensor, shape=shape)
subset = 'validation' if self._doing_eval else 'train'
input_shapes = self.model.get_input_shapes(subset)
input_list = [
device_aware_reshape(input_list[i], shape=input_shapes[i])
for i in range(len(input_list))
]
def forward_pass_and_gradients():
"""Builds forward pass and gradient computation network.
When phase_train=True and print_training_accuracy=False:
return [loss] + grads
When phase_train=True and print_training_accuracy=True:
return [logits, loss] + grads
When phase_train=False,
return [logits]
Its output can always be unpacked by
```
outputs = forward_pass_and_gradients()
logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
```
Returns:
outputs: A list of tensors depending on different modes.
"""
build_network_result = self.model.build_network(
input_list, phase_train, nclass)
logits = build_network_result.logits
if not phase_train:
return [logits]
base_loss = self.model.loss_function(input_list, build_network_result)
params = self.variable_mgr.trainable_variables_on_device(
rel_device_num, abs_device_num)
l2_loss = None
total_loss = base_loss
with tf.name_scope('l2_loss'):
fp32_params = params
if self.model.data_type == tf.float16 and self.params.fp16_vars:
# fp16 reductions are very slow on GPUs, so cast to fp32 before
# calling tf.nn.l2_loss and tf.add_n.
# TODO(b/36217816): Once the bug is fixed, investigate if we should do
# this reduction in fp16.
fp32_params = (tf.cast(p, tf.float32) for p in params)
filtered_params = self.model.filter_l2_loss_vars(fp32_params)
if rel_device_num == len(self.devices) - 1:
# We compute the L2 loss for only one device instead of all of them,
# because the L2 loss for each device is the same. To adjust for this,
# we multiply the L2 loss by the number of devices. We choose the
# last device because for some reason, on a Volta DGX1, the first four
# GPUs take slightly longer to complete a step than the last four.
# TODO(reedwm): Shard the L2 loss computations across GPUs.
if self.params.single_l2_loss_op:
# TODO(reedwm): If faster, create a fused op that does the L2 loss
# on multiple tensors, and use that instead of concatenating
# tensors.
reshaped_params = [tf.reshape(p, (-1,)) for p in filtered_params]
l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0))
else:
l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in filtered_params])
weight_decay = self.params.weight_decay
mlperf.logger.log(key=mlperf.tags.OPT_WEIGHT_DECAY, value=weight_decay)
if (weight_decay is not None and weight_decay != 0. and
l2_loss is not None):
mlperf.logger.log(key=mlperf.tags.MODEL_L2_REGULARIZATION,
value=weight_decay)
total_loss += len(self.devices) * weight_decay * l2_loss
aggmeth = tf.AggregationMethod.DEFAULT
scaled_loss = (total_loss if self.loss_scale is None
else total_loss * self.loss_scale)
grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth)
if self.params.sparse_to_dense_grads:
# Passing a sparse gradient to convert_to_tensor turns it into a dense
# gradient. A sparse gradient is an instance of tf.IndexedSlices.
# convert_to_tensor does not modify dense tensors.
grads = [tf.convert_to_tensor(g) for g in grads]
if self.loss_scale is not None:
# TODO(reedwm): If automatic loss scaling is not used, we could avoid
# these multiplications by directly modifying the learning rate instead.
# If this is done, care must be taken to ensure that this scaling method
# is correct, as some optimizers square gradients and do other
# operations which might not be compatible with modifying both the
# gradients and the learning rate.
grads = [
grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads
]
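        # Worked example of the round trip above: with loss_scale = 128,
        # d(scaled_loss)/dw = 128 * d(loss)/dw, and the 1/128 factor here
        # recovers d(loss)/dw exactly, while the intermediate fp16 values are
        # 128x larger and therefore less likely to underflow to zero.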
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
if self.params.horovod_device:
horovod_device = '/%s:0' % self.params.horovod_device
else:
horovod_device = ''
# All-reduce gradients using Horovod.
grads = [hvd.allreduce(grad, average=False, device_dense=horovod_device)
for grad in grads]
if self.params.staged_vars:
grad_dtypes = [grad.dtype for grad in grads]
grad_shapes = [grad.shape for grad in grads]
grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes)
grad_stage_op = grad_stage.put(grads)
# In general, this decouples the computation of the gradients and
# the updates of the weights.
# During the pipeline warm up, this runs enough training to produce
# the first set of gradients.
gpu_grad_stage_ops.append(grad_stage_op)
grads = grad_stage.get()
if self.params.loss_type_to_report == 'total_loss':
loss = total_loss
else:
loss = base_loss
if self.params.print_training_accuracy:
return [logits, loss] + grads
else:
return [loss] + grads
def unpack_forward_pass_and_gradients_output(forward_pass_and_grad_outputs):
"""Unpacks outputs from forward_pass_and_gradients.
Args:
forward_pass_and_grad_outputs: Output from forward_pass_and_gradients.
Returns:
logits: Unscaled probability distribution from forward pass.
If unavailable, None is returned.
loss: Loss function result from logits.
If unavailable, None is returned.
grads: Gradients for all trainable variables.
If unavailable, None is returned.
"""
logits = None
# logits is only fetched in non-train mode or when
# print_training_accuracy is set.
if not phase_train or self.params.print_training_accuracy:
logits = forward_pass_and_grad_outputs.pop(0)
loss = (
forward_pass_and_grad_outputs[0]
if forward_pass_and_grad_outputs else None)
grads = (
forward_pass_and_grad_outputs[1:]
if forward_pass_and_grad_outputs else None)
return logits, loss, grads
def make_results(logits, loss, grads):
"""Generate results based on logits, loss and grads."""
results = {} # The return value
if logits is not None:
results['logits'] = logits
accuracy_ops = self.model.accuracy_function(input_list, logits)
for name, op in accuracy_ops.items():
results['accuracy:' + name] = op
if loss is not None:
results['loss'] = loss
if grads is not None:
param_refs = self.variable_mgr.trainable_variables_on_device(
rel_device_num, abs_device_num, writable=True)
results['gradvars'] = list(zip(grads, param_refs))
return results
with tf.device(self.devices[rel_device_num]):
outputs = maybe_compile(forward_pass_and_gradients, self.params)
logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
return make_results(logits, loss, grads)
def get_input_preprocessor(self):
"""Returns the image preprocessor to used, based on the model.
Returns:
The image preprocessor, or None if synthetic data should be used.
"""
shift_ratio = 0
if self.job_name:
# shift_ratio prevents multiple workers from processing the same batch
# during a step
shift_ratio = self.task_index / self.num_workers
processor_class = self.dataset.get_input_preprocessor(
self.params.input_preprocessor)
assert processor_class
subset = 'validation' if self._doing_eval else 'train'
return processor_class(
self.batch_size * self.batch_group_size,
self.model.get_input_shapes(subset),
len(self.devices) * self.batch_group_size,
dtype=self.model.data_type,
train=(not self._doing_eval),
# TODO(laigd): refactor away image model specific parameters.
distortions=self.params.distortions,
resize_method=self.resize_method,
shift_ratio=shift_ratio,
summary_verbosity=self.params.summary_verbosity,
distort_color_in_yiq=self.params.distort_color_in_yiq,
fuse_decode_and_crop=self.params.fuse_decode_and_crop,
match_mlperf=self.params.ml_perf)
def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list):
"""Adds ops to enqueue on all worker queues.
Args:
name_prefix: prefixed for the shared_name of ops.
enqueue_after_list: control dependency from ops.
Returns:
An op that should be used as control dependency before starting next step.
"""
self.sync_queue_counter += 1
with tf.device(self.sync_queue_devices[(
self.sync_queue_counter % len(self.sync_queue_devices))]):
sync_queues = [
tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]],
shared_name='%s%s' % (name_prefix, i))
for i in range(self.num_workers)]
queue_ops = []
# For each other worker, add an entry in a queue, signaling that it can
# finish this step.
token = tf.constant(False)
with tf.control_dependencies(enqueue_after_list):
for i, q in enumerate(sync_queues):
if i == self.task_index:
queue_ops.append(tf.no_op())
else:
queue_ops.append(q.enqueue(token))
# Drain tokens off queue for this worker, one for each other worker.
queue_ops.append(
sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1))
return tf.group(*queue_ops)
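  # Worked example: with num_workers = 3 and task_index = 1, this worker
  # enqueues one token into queues 0 and 2 and dequeues 2 tokens from queue 1,
  # so the returned group op completes only after workers 0 and 2 have also
  # reached the barrier.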
def _is_mkl_flag_absent(mkl_flag):
return not (absl_flags.FLAGS.is_parsed() and mkl_flag in absl_flags.FLAGS
and absl_flags.FLAGS[mkl_flag].present)
def _print_os_env_ignored_warning(mkl_flag, flag_default_val, os_env_var):
tf.logging.warn(
('OS ENV variable %s=%s is ignored and script default: '
'%s is used. Use --%s to override.') %
(os_env_var, os.environ[os_env_var], flag_default_val, mkl_flag))
def set_default_param_values_and_env_vars(params):
"""Sets up the default param values and environment variables ."""
if params.batchnorm_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
else:
os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
if params.winograd_nonfused:
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
else:
os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None)
if params.autotune_threshold:
os.environ['TF_AUTOTUNE_THRESHOLD'] = str(params.autotune_threshold)
os.environ['TF_SYNC_ON_FINISH'] = str(int(params.sync_on_finish))
argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Sets environment variables for MKL
# If OS ENV vars are overridden by script defaults, a warning msg is printed.
if params.mkl:
mkl_flags = ['kmp_blocktime', 'kmp_settings', 'kmp_affinity',
'num_intra_threads']
for mkl_flag in mkl_flags:
os_env_var = mkl_flag.upper()
if mkl_flag == 'num_intra_threads':
os_env_var = 'OMP_NUM_THREADS'
flag_val = str(getattr(params, mkl_flag))
if _is_mkl_flag_absent(mkl_flag) and os_env_var in os.environ:
_print_os_env_ignored_warning(mkl_flag, flag_val, os_env_var)
os.environ[os_env_var] = flag_val
if mkl_flag == 'num_intra_threads' and not params.num_intra_threads:
os.environ.pop(os_env_var, None)
# Sets GPU thread settings
if params.device.lower() == 'gpu':
params = params._replace(gpu_thread_mode=params.gpu_thread_mode.lower())
if params.gpu_thread_mode not in ['global', 'gpu_shared', 'gpu_private']:
raise ValueError('Invalid gpu_thread_mode: %s' % params.gpu_thread_mode)
os.environ['TF_GPU_THREAD_MODE'] = params.gpu_thread_mode
if params.per_gpu_thread_count and params.gpu_thread_mode == 'global':
raise ValueError(
'Invalid per_gpu_thread_count with gpu_thread_mode=global: %s' %
params.per_gpu_thread_count)
# Default to two threads. One for the device compute and the other for
# memory copies.
per_gpu_thread_count = params.per_gpu_thread_count or 2
total_gpu_thread_count = per_gpu_thread_count * params.num_gpus
if params.gpu_thread_mode == 'gpu_private':
os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
elif params.gpu_thread_mode == 'gpu_shared':
os.environ['TF_GPU_THREAD_COUNT'] = str(total_gpu_thread_count)
cpu_count = multiprocessing.cpu_count()
if not params.num_inter_threads and params.gpu_thread_mode in [
'gpu_private', 'gpu_shared'
]:
main_thread_count = max(cpu_count - total_gpu_thread_count, 1)
params = params._replace(num_inter_threads=main_thread_count)
if (params.datasets_use_prefetch and
params.datasets_num_private_threads is None):
# From the total cpu thread count, subtract the total_gpu_thread_count,
# and then 2 threads per GPU device for event monitoring and sending /
# receiving tensors
num_monitoring_threads = 2 * params.num_gpus
num_private_threads = max(
cpu_count - total_gpu_thread_count - num_monitoring_threads, 1)
params = params._replace(datasets_num_private_threads=num_private_threads)
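      # Worked example on a hypothetical machine: with cpu_count = 64,
      # num_gpus = 8 and per_gpu_thread_count = 2, total_gpu_thread_count = 16
      # and num_monitoring_threads = 16, so datasets_num_private_threads
      # defaults to 64 - 16 - 16 = 32.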
return params
def setup(params):
"""Sets up the environment that BenchmarkCNN should run in.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
Returns:
A potentially modified params.
Raises:
    ValueError: invalid parameter combinations.
"""
  # Set up environment variables before doing any other global initialization,
  # so that the later initialization uses the appropriate environment variables.
params = set_default_param_values_and_env_vars(params)
# horovod needs to be initialized before create_config_proto() call since
# it will be used in config generation if enabled.
if params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
hvd.init()
platforms_util.initialize(params, create_config_proto(params))
if not params.job_name:
# Create a dummy session to initialize TF global variables using the input
# params. Otherwise, ListDevices function may create global devices using
# the default config instead of using the user provided config.
#
# TODO(hinsu): Find a way to achieve the same for distributed benchmark. It
# is not legal to create distributed session after local session. It is also
# not possible to create distributed session here as that results in
# multiple creation of ClusterManager and Server.
with tf.Session(config=create_config_proto(params)) as sess:
del sess
return params
def maybe_compile(computation, params):
if params and params.xla_compile:
return tf.xla.experimental.compile(computation)
else:
return computation()
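# Purely illustrative sketch (not part of the benchmark) of the XLA path above,
# assuming a TF 1.x build where tf.xla.experimental.compile is available; the
# toy _toy_computation below is hypothetical:
#
#   def _toy_computation():
#     x = tf.constant([[1.0, 2.0]])
#     return tf.matmul(x, x, transpose_b=True)
#
#   (result,) = tf.xla.experimental.compile(_toy_computation)
#   # 'result' is the compiled computation's single output tensor.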
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests running benchmark_cnn in distributed mode.
This is done by spawning one process per task. Each process runs
benchmark_cnn_distributed_test_runner.py.
The output for each process is written to disk and can be viewed to debug tests.
See get_test_output_dir() in platforms/default/util.py for more info.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import os
import subprocess
import time
import unittest
from absl import flags as absl_flags
import portpicker
import six
import tensorflow.compat.v1 as tf
import flags
import test_util
from platforms import util as platforms_util
FLAGS = absl_flags.FLAGS
def _convert_params_to_flags_list(params):
"""Converts Params to a list of flags. Skips default-valued parameters.
E.g., converts
benchmark_cnn.make_params(batch_size=32, model='resnet50')
to
['--batch_size=32', '--model=resnet50']
Args:
params: Params for BenchmarkCNN.
Returns:
A list of flags.
"""
return [
'--%s=%s' % (k, str(v)) for k, v in six.iteritems(params._asdict())
if v != flags.param_specs[k].default_value
]
# When outputting a process's output in the log, maximum number of characters
# to output. The log system does not allow us to output more than this in a
# single log message, but this limit is also useful to avoid the logs from
# becoming too large (the full process output is written to disk).
MAX_OUTPUT_CHARS = 15000
# Information about a spawned process: name is a string identifying the
# process in logs; stdout and stderr are file objects of the process's stdout
# and stderr, respectively.
_ProcessInfo = namedtuple('_ProcessInfo', ['name', 'popen', 'stdout', 'stderr'])
def _create_task_process(job_name, task_index, args, env, output_dir):
"""Creates a process for a single task for benchmark_cnn.
Args:
job_name: 'worker' or 'ps' or ''. Empty string used for non-distributed
mode.
task_index: The index of the task within the cluster.
args: A list of arguments to pass to the task. This function additionally
sets --task_index and --job_name
env: The environment to use for the task.
output_dir: Where to place the output files, storing the task's stdout and
stderr.
Returns:
A _ProcessInfo namedtuple of the running process. The stdout and stderr
fields of this tuple must be closed by the caller once the process ends.
"""
args = args[:]
args += ['--task_index=%s' % task_index, '--job_name=%s' % job_name]
name_prefix = job_name or 'local'
process_name = '%s_%s' % (name_prefix, task_index)
tf.logging.info('Spawning %s process: %s' % (process_name, ' '.join(args)))
stdout_filename = os.path.join(output_dir, '%s_stdout.txt' % process_name)
stderr_filename = os.path.join(output_dir, '%s_stderr.txt' % process_name)
stdout_file = open(stdout_filename, 'w+')
stderr_file = open(stderr_filename, 'w+')
popen = subprocess.Popen(
args, stdout=stdout_file, stderr=stderr_file, env=env)
return _ProcessInfo(process_name, popen, stdout_file, stderr_file)
def _wait_for_processes(wait_processes, kill_processes):
"""Waits until all `wait_processes` finish, then kills `kill_processes`.
Fails an assert if a process in `wait_processes` finishes unsuccessfully.
The processes in `kill_processes` are assumed to never finish so they are
killed.
Args:
wait_processes: A list of _ProcessInfo tuples. This function will wait
for each to finish.
kill_processes: A list of _ProcessInfo tuples. Each will be killed once
every process in `wait_processes` is finished.
Returns:
A list of strings, each which is a string of the stdout of a wait process.
"""
wait_process_stdouts = [None] * len(wait_processes)
finished_wait_processes = set()
while len(finished_wait_processes) < len(wait_processes):
for i, wait_process in enumerate(wait_processes):
if i in finished_wait_processes:
continue
ret_code = wait_process.popen.poll()
if ret_code is None:
continue
tf.logging.info('{} finished'.format(wait_process.name))
wait_process.stdout.seek(0)
wait_process_stdouts[i] = wait_process.stdout.read()
tf.logging.info('stdout for {} (last {} chars): {}\n'.format(
wait_process.name, MAX_OUTPUT_CHARS,
wait_process_stdouts[i][-MAX_OUTPUT_CHARS:]))
wait_process.stderr.seek(0)
tf.logging.info('stderr for {} (last {} chars): {}\n'.format(
wait_process.name, MAX_OUTPUT_CHARS,
wait_process.stderr.read()[-MAX_OUTPUT_CHARS:]))
assert ret_code == 0, 'Process failed with return code %d' % ret_code
finished_wait_processes.add(i)
for kill_process in kill_processes:
ret_code = kill_process.popen.poll()
# kill processes should not end until we kill them.
assert ret_code is None, 'Process returned early with code %d' % ret_code
time.sleep(0.25)
tf.logging.info('All wait processes finished')
for i, kill_process in enumerate(kill_processes):
# Kill each kill process.
kill_process.popen.kill()
kill_process.popen.wait()
kill_process.stdout.seek(0)
tf.logging.info('stdout for {} (last {} chars): {}\n'.format(
kill_process.name, MAX_OUTPUT_CHARS,
kill_process.stdout.read()[-MAX_OUTPUT_CHARS:]))
kill_process.stderr.seek(0)
tf.logging.info('stderr for {} (last {} chars): {}\n'.format(
kill_process.name, MAX_OUTPUT_CHARS,
kill_process.stderr.read()[-MAX_OUTPUT_CHARS:]))
return wait_process_stdouts
def _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
num_controllers, params):
"""Run training or evaluation in spawned processes.
Runs locally if num_workers == 1, num_ps == 0, and num_controllers == 0,
otherwise runs in distributed mode. In either case, one process is spawned
per worker and ps. Waits for training/evaluation to finish before returning.
Args:
output_dir_path: Relative path where stdout and stderr files will be
placed.
num_workers: Number of workers to spawn.
num_ps: Number of ps processes to spawn.
num_controllers: Number of controller processes to spawn (must be 0 or 1).
params: Params for BenchmarkCNN in each subprocess.
Returns:
A list output_list of outputs from all processes that output the
images/sec and accuracy. This process is the controller host in
distributed_all_reduce, and the workers otherwise. output_list[i] is a
list of lines from the ith worker's stdout.
"""
run_distributed = num_workers != 1 or num_ps != 0 or num_controllers != 0
if params.variable_update == 'distributed_all_reduce':
assert num_controllers == 1 or not run_distributed
assert num_ps == 0
else:
assert num_controllers == 0
output_base_dir = platforms_util.get_test_output_dir()
output_dir = os.path.join(output_base_dir, output_dir_path)
os.makedirs(output_dir)
tf.logging.info('Outputs of processes will be outputted to: %s' % output_dir)
args = platforms_util.get_command_to_run_python_module(
'benchmark_cnn_distributed_test_runner')
args += _convert_params_to_flags_list(params)
if run_distributed:
worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
controller_ports = [portpicker.pick_unused_port()
for _ in range(num_controllers)]
# The numerator is 0.7 instead of 1 to leave some memory for the Cuda
# runtime, etc.
gpu_memory_frac = 0.7 / num_workers
args += [
'--gpu_memory_frac_for_testing=%f' % gpu_memory_frac,
'--worker_hosts=' + ','.join('localhost:%d' % p for p in worker_ports)
]
if num_ps > 0:
ps_hosts_str = ','.join('localhost:%d' % p for p in ps_ports)
args.append('--ps_hosts=' + ps_hosts_str)
else:
controller_host_str = ','.join('localhost:%d' % p
for p in controller_ports)
args.append('--controller_host=' + controller_host_str)
env = os.environ.copy()
# Allow stdout to be viewed before the process ends.
env['PYTHONUNBUFFERED'] = '1'
worker_processes = []
ps_processes = []
controller_processes = []
try:
for i in range(num_workers):
job_name = 'worker' if run_distributed else ''
process = _create_task_process(job_name, i, args, env, output_dir)
worker_processes.append(process)
# Don't let ps or controller processes use the gpu.
env['CUDA_VISIBLE_DEVICES'] = ''
for i in range(num_ps):
process = _create_task_process('ps', i, args, env, output_dir)
ps_processes.append(process)
for i in range(num_controllers):
process = _create_task_process('controller', i, args, env, output_dir)
controller_processes.append(process)
    # If distributed all-reduce mode is being used, the controller process
    # finishes and the worker processes block forever. Otherwise, the worker
    # processes finish and the ps processes block forever. We set
    # wait_processes and kill_processes accordingly.
if controller_processes:
wait_processes = controller_processes
kill_processes = worker_processes
else:
wait_processes = worker_processes
kill_processes = ps_processes
outputs = _wait_for_processes(wait_processes, kill_processes)
finally:
for process in worker_processes + ps_processes + controller_processes:
try:
process.popen.kill()
except OSError:
pass # It's OK (and expected) if the process already exited.
process.stdout.close()
process.stderr.close()
return [output.splitlines() for output in outputs]
# When this test class is run, a method will fail about 0.3% of the time with a
# gRPC error. It is not clear why this occurs.
# TODO(reedwm): Fix this test class.
class TfCnnBenchmarksDistributedTest(tf.test.TestCase):
"""Tests running benchmark_cnn in distributed mode."""
# We cannot check for a GPU via tf.test.is_gpu_available() before the tests in
# this class because it allocates all the GPU memory which would cause the
# spawned processes to run out of GPU memory.
def _test_distributed(self,
test_name,
num_workers,
num_ps,
params,
num_controllers=0,
check_output_values=False,
skip=None):
# TODO(reedwm): check_output_values should default to True and be enabled
# on every test. See the TODO in benchmark_cnn_test.py.
def run_fn(run_type, inner_params):
output_dir_path = os.path.join(test_name, run_type)
if run_type == 'Evaluation':
# Distributed evaluation is not supported, so we use a single process.
# We still must spawn another process, because if we evaluate in the
# current process, it would allocate the GPU memory causing future test
# methods to fail.
if inner_params.variable_update == 'distributed_replicated':
inner_params = inner_params._replace(variable_update='replicated')
return _spawn_benchmark_processes(
output_dir_path, num_workers=1, num_ps=0, num_controllers=0,
params=inner_params)
else:
return _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
num_controllers, inner_params)
return test_util.train_and_eval(self, run_fn, params,
check_output_values=check_output_values,
skip=skip)
def testParameterServer(self):
test_name = 'testParameterServer'
params = test_util.get_params(test_name)
self._test_distributed(test_name, 2, 2, params)
def testParameterServerStaged(self):
test_name = 'testParameterServerStaged'
params = test_util.get_params(test_name)._replace(staged_vars=True)
self._test_distributed(test_name, 2, 2, params)
def testReplicated(self):
test_name = 'testReplicated'
params = test_util.get_params(test_name)._replace(
variable_update='distributed_replicated')
self._test_distributed(test_name, 2, 2, params)
def testAllReducePsgpu(self):
test_name = 'testAllReducePsgpu'
flags_dict = test_util.get_params(test_name)._replace(
variable_update='distributed_all_reduce',
all_reduce_spec='psgpu#4')
self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1)
def testAllReducePscpuXring(self):
test_name = 'testAllReducePscpuXring'
flags_dict = test_util.get_params(test_name)._replace(
variable_update='distributed_all_reduce',
all_reduce_spec='pscpu:2k:xring')
self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1)
def testForwardOnly(self):
test_name = 'testForwardOnly'
params = test_util.get_params(test_name)._replace(forward_only=True)
# Evaluation is not supported with --forward_only, so we set skip='eval'.
self._test_distributed(test_name, 2, 2, params, skip='eval')
def testSingleWorkerAndPs(self):
test_name = 'testSingleWorkerAndPs'
params = test_util.get_params(test_name)
self._test_distributed(test_name, 1, 1, params)
def testThreeWorkersAndPses(self):
test_name = 'testThreeWorkersAndPses'
params = test_util.get_params(test_name)
self._test_distributed(test_name, 3, 3, params)
def testOneWorkerThreePses(self):
test_name = 'testOneWorkerThreePses'
params = test_util.get_params(test_name)
self._test_distributed(test_name, 1, 3, params)
def testThreeWorkersOnePs(self):
test_name = 'testThreeWorkersOnePs'
params = test_util.get_params(test_name)
self._test_distributed(test_name, 3, 1, params)
def testNoPrintTrainingAccuracy(self):
test_name = 'testNoPrintTrainingAccuracy'
params = test_util.get_params(test_name)._replace(
print_training_accuracy=False)
self._test_distributed(test_name, 2, 2, params)
def testRmspropParameterServer(self):
test_name = 'testRmspropParameterServer'
params = test_util.get_params(test_name)._replace(optimizer='rmsprop')
self._test_distributed(test_name, 2, 2, params)
def testMomentumReplicated(self):
test_name = 'testMomentumReplicated'
params = test_util.get_params(test_name)._replace(
optimizer='momentum', variable_update='distributed_replicated')
self._test_distributed(test_name, 2, 2, params)
def testNoCrossReplicaSyncParameterServerStaged(self):
test_name = 'testNoCrossReplicaSyncParameterServerStaged'
params = test_util.get_params(test_name)._replace(
staged_vars=True, cross_replica_sync=False)
self._test_distributed(test_name, 2, 2, params)
def testSingleGpu(self):
test_name = 'testSingleGpu'
params = test_util.get_params(test_name)._replace(num_gpus=1)
self._test_distributed(test_name, 2, 2, params)
def testBatchGroupSize(self):
test_name = 'testBatchGroupSize'
params = test_util.get_params(test_name)._replace(
batch_group_size=4, num_batches=100, num_warmup_batches=5)
self._test_distributed(test_name, 2, 2, params)
def testFp16WithFp32Vars(self):
test_name = 'testFp16WithFp32Vars'
params = test_util.get_params(test_name)._replace(
use_fp16=True, fp16_vars=False)
self._test_distributed(test_name, 2, 2, params)
def testFp16WithFp16Vars(self):
test_name = 'testFp16WithFp16Vars'
params = test_util.get_params(test_name)._replace(
use_fp16=True, fp16_vars=True, fp16_loss_scale=1.)
self._test_distributed(test_name, 2, 2, params)
def testFp16Replicated(self):
test_name = 'testFp16Replicated'
params = test_util.get_params(test_name)._replace(
use_fp16=True, variable_update='distributed_replicated')
self._test_distributed(test_name, 2, 2, params)
@unittest.skip('b/147310862: Fails for unknown reason')
def testReplicatedRealData(self):
test_name = 'testReplicatedRealData'
imagenet_dir = os.path.join(platforms_util.get_test_data_dir(),
'fake_tf_record_data')
params = test_util.get_params(test_name)._replace(
variable_update='distributed_replicated',
data_dir=imagenet_dir,
data_name='imagenet')
self._test_distributed(test_name, 2, 2, params)
class DistributedVariableUpdateTest(tf.test.TestCase):
"""Tests that variables are updated correctly in distributed mode."""
def _test_variable_update(self,
test_name,
num_workers,
num_ps,
params,
num_controllers=0):
"""Tests variables are updated correctly when the given params are used."""
output_dir_path = os.path.join(test_name, 'variable_update')
logs = _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
num_controllers, params)
actual_losses = []
for worker_logs in logs:
outputs = test_util.get_training_outputs_from_logs(
worker_logs, params.print_training_accuracy)
actual_losses.append([x.loss for x in outputs])
inputs = test_util.get_fake_var_update_inputs()
expected_losses = test_util.TestCNNModel().manually_compute_losses(
inputs, num_workers, params)
if params.variable_update == 'distributed_all_reduce':
      # In distributed all-reduce, at each step the controller outputs the
      # average of the losses from the workers, so we modify the expected
      # losses accordingly. E.g., we change [[1, 2], [4, 5]] to [[2.5, 3.5]].
expected_losses = [[sum(losses) / num_workers
for losses in zip(*expected_losses)]]
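    # fp16 runs accumulate noticeably more rounding error than fp32, so the
    # comparison below uses a much looser relative tolerance.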
rtol = 3e-2 if params.use_fp16 else 1e-5
for worker_actual_losses, worker_expected_losses in zip(actual_losses,
expected_losses):
self.assertAllClose(worker_actual_losses[:len(worker_expected_losses)],
worker_expected_losses, rtol=rtol, atol=0.)
def _test_variable_updates(self, test_name, params):
"""Tests variables are updated correctly with various variable updates."""
# Unfortunately, distributed parameter server is non-deterministic with
# multiple workers, because one worker may write to a variable before
# another worker reads it. This probably does not harm training, but it
# does mean we cannot easily test that case. So, we use one worker.
self._test_variable_update(
test_name + '_ps', num_workers=1, num_ps=2, num_controllers=0,
params=params._replace(variable_update='parameter_server'))
self._test_variable_update(
test_name + '_rep', num_workers=2, num_ps=1, num_controllers=0,
params=params._replace(variable_update='distributed_replicated'))
self._test_variable_update(
test_name + '_allreduce', num_workers=2, num_ps=0, num_controllers=1,
params=params._replace(variable_update='distributed_all_reduce',
all_reduce_spec='psgpu#%d' % params.num_gpus))
def testVarUpdateDefault(self):
params = test_util.get_var_update_params()
self._test_variable_updates('testVarUpdateDefault', params)
def testVarUpdateCpuAsLocalParamDevice(self):
params = test_util.get_var_update_params()._replace(
local_parameter_device='cpu')
self._test_variable_updates('testVarUpdateCpuAsLocalParamDevice', params)
def testVarUpdateFp16(self):
params = test_util.get_var_update_params()._replace(use_fp16=True)
self._test_variable_updates('testVarUpdateFp16', params)
def testVarUpdateResourceVars(self):
params = test_util.get_var_update_params()._replace(use_resource_vars=True)
self._test_variable_updates('testVarUpdateResourceVars', params)
if __name__ == '__main__':
tf.disable_v2_behavior()
tf.test.main()