Commit ee3997b3 authored by qianyj

new tf branch for dtk21.10.1

parent 2795dc1f
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Checkout repository, download data and build docker image."""
from __future__ import print_function
import argparse
import json
import logging
import os
import shutil
import sys
import tempfile
import time
import perfzero.device_utils as device_utils
import perfzero.perfzero_config as perfzero_config
import perfzero.utils as utils
def _temporary_file_name(parent_dir, base_name):
"""Returns a temp name of the form <parent-dir>/<random>/<base-name>."""
if not os.path.isdir(parent_dir):
os.makedirs(parent_dir)
temp_dir = tempfile.mkdtemp(dir=parent_dir)
return os.path.join(temp_dir, base_name)
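# Minimal usage sketch (paths are hypothetical): _temporary_file_name('/workspace', 'local_docker')
# creates '/workspace' if needed, makes a fresh random temp dir inside it, and returns
# something like '/workspace/tmpab12cd/local_docker'; the file itself is not created.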
def _load_docker_image(FLAGS, workspace_dir, setup_execution_time):
"""Runs docker load --input_image <FLAGS.dockerfile_path>.
Fetches FLAGS.dockerfile_path to workspace_dir/<temp-dir>/local_docker first.
Runs docker load --input <path-to-local-docker>.
Deletes workspace_dir/<temp-dir> after the docker image is loaded.
Args:
FLAGS: parser.parse_known_args object.
workspace_dir: String - The path to use for intermediate artifacts.
setup_execution_time: Map from string->double containing wall times for
different operations. This will have insertions describing the docker
setup time.
"""
load_docker_start_time = time.time()
local_docker_image_path = _temporary_file_name(workspace_dir, 'local_docker')
utils.download_data([{'url': FLAGS.dockerfile_path,
'local_path': local_docker_image_path,
'decompress': False}])
setup_execution_time['fetch_docker'] = time.time() - load_docker_start_time
docker_load_cmd = 'docker load --input {}'.format(local_docker_image_path)
try:
utils.run_commands(
[docker_load_cmd,
'docker images' # Print loaded image list.
])
setup_execution_time['load_docker'] = time.time() - load_docker_start_time
finally:
logging.info('removing parent dir of local docker image copy %s',
local_docker_image_path)
shutil.rmtree(os.path.dirname(local_docker_image_path))
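# Hedged sketch of what this helper ends up running, assuming FLAGS.dockerfile_path
# points at a saved image tarball (bucket/file names are illustrative):
#   1. download gs://bucket/perfzero_image.tar.gz -> <workspace>/<tmp>/local_docker
#   2. docker load --input <workspace>/<tmp>/local_docker
#   3. docker images            # print the loaded image list
#   4. rm -rf <workspace>/<tmp> # always, even if the load fails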
def _create_docker_image(FLAGS, project_dir, workspace_dir,
setup_execution_time):
"""Creates a docker image.
Args:
FLAGS: parser.parse_known_args object.
project_dir: String - The current project path.
workspace_dir: String - The path to use for intermediate artifacts.
setup_execution_time: Map from string->double containing wall times for
different operations. This will have insertions describing the docker
setup time.
"""
# Create docker image
docker_start_time = time.time()
docker_context = os.path.join(workspace_dir, 'resources')
# Necessary in case we don't have a local .whl file.
utils.create_empty_file(docker_context, 'EMPTY')
# Download TensorFlow pip package from Google Cloud Storage and modify package
# path accordingly, if applicable
local_tensorflow_pip_spec = None
if (FLAGS.tensorflow_pip_spec and
(FLAGS.tensorflow_pip_spec.startswith('gs://') or
FLAGS.tensorflow_pip_spec.startswith('file://'))):
local_pip_filename = os.path.basename(FLAGS.tensorflow_pip_spec)
local_pip_path = os.path.join(docker_context, local_pip_filename)
utils.download_data([{'url': FLAGS.tensorflow_pip_spec,
'local_path': local_pip_path}])
# Update path to pip wheel file for the Dockerfile. Note that this path has
# to be relative to the docker context (absolute path will not work).
FLAGS.tensorflow_pip_spec = local_pip_filename
local_tensorflow_pip_spec = local_pip_filename
else:
local_tensorflow_pip_spec = 'EMPTY'
dockerfile_path = FLAGS.dockerfile_path
if not os.path.exists(dockerfile_path):
# Fall back to the deprecated approach if the user-specified
# dockerfile_path does not exist
dockerfile_path = os.path.join(project_dir, FLAGS.dockerfile_path)
extra_pip_specs = (FLAGS.extra_pip_specs or '').replace(';', '')
docker_base_cmd = 'docker build --no-cache --pull'
# FLAGS.extra_docker_build_args will be a list of strings (e.g. ['a', 'b=c']).
# We treat the strings directly as build-args: --build-arg a --build-arg b=c
# Empty strings are ignored.
extra_docker_build_args = ' '.join([
'--build-arg %s' % arg for arg in FLAGS.extra_docker_build_args if arg])
cmd = '{docker_base_cmd} -t {docker_tag}{tf_pip}{local_tf_pip}{extra_pip}{extra_docker_build_args} {suffix}'.format(
docker_base_cmd=docker_base_cmd,
docker_tag=FLAGS.docker_tag,
tf_pip=(
' --build-arg tensorflow_pip_spec={}'.format(
FLAGS.tensorflow_pip_spec) if FLAGS.tensorflow_pip_spec else ''),
# local_tensorflow_pip_spec is either string 'EMPTY' or basename of
# local .whl file.
local_tf_pip=' --build-arg local_tensorflow_pip_spec={}'.format(
local_tensorflow_pip_spec),
extra_pip=' --build-arg extra_pip_specs=\'{}\''.format(extra_pip_specs),
extra_docker_build_args=' ' + extra_docker_build_args,
suffix=(
'-f {} {}'.format(dockerfile_path, docker_context)
if docker_context else '- < {}'.format(dockerfile_path))
)
utils.run_commands([cmd])
logging.info('Built docker image with tag %s', FLAGS.docker_tag)
setup_execution_time['build_docker'] = time.time() - docker_start_time
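# Illustrative (not authoritative) shape of the command assembled above, assuming
# --docker_tag=perfzero/tensorflow and a local wheel tf.whl downloaded into the context:
#   docker build --no-cache --pull -t perfzero/tensorflow \
#     --build-arg tensorflow_pip_spec=tf.whl \
#     --build-arg local_tensorflow_pip_spec=tf.whl \
#     --build-arg extra_pip_specs='' \
#     -f <dockerfile_path> <workspace>/resources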
if __name__ == '__main__':
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
perfzero_config.add_setup_parser_arguments(parser)
FLAGS, unparsed = parser.parse_known_args()
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
level=logging.DEBUG)
if unparsed:
logging.error('Arguments %s are not recognized', unparsed)
sys.exit(1)
setup_execution_time = {}
project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
workspace_dir = os.path.join(project_dir, FLAGS.workspace)
site_package_dir = os.path.join(workspace_dir, 'site-packages')
utils.copy_and_rename_dirs(FLAGS.site_package_downloads,
site_package_dir)
activate_gcloud = False
if FLAGS.dockerfile_path and FLAGS.dockerfile_path.startswith('gs://'):
# We might end up doing gsutil fetch later, so need to call
# active_gcloud_service().
activate_gcloud = True
if FLAGS.tensorflow_pip_spec and FLAGS.tensorflow_pip_spec.startswith('gs://'):
activate_gcloud = True
# Download gcloud auth token. Remove this operation in the future when
# docker in Kokoro can access the GCP metadata server.
start_time = time.time()
utils.active_gcloud_service(FLAGS.gcloud_key_file_url,
workspace_dir, download_only=not activate_gcloud)
setup_execution_time['download_token'] = time.time() - start_time
# Set up the raid array.
start_time = time.time()
device_utils.create_drive_from_devices(FLAGS.root_data_dir,
FLAGS.gce_nvme_raid)
setup_execution_time['create_drive'] = time.time() - start_time
if FLAGS.dockerfile_path:
if FLAGS.dockerfile_path.endswith('.tar.gz'):
logging.info('Assuming given file %s is a docker image to load',
FLAGS.dockerfile_path)
_load_docker_image(FLAGS, workspace_dir,
setup_execution_time)
else:
_create_docker_image(FLAGS, project_dir, workspace_dir,
setup_execution_time)
logging.info('Setup time in seconds by operation:\n %s',
json.dumps(setup_execution_time, indent=2))
[
{
"name": "execution_timestamp",
"type": "TIMESTAMP",
"mode": "REQUIRED"
},
{
"name": "execution_id",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "ml_framework_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "benchmark_result",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "benchmark_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "setup_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "system_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "process_info",
"type": "STRING",
"mode": "NULLABLE"
}
]
#!/usr/bin/env bash
#
# Steps:
#
# 1. Download corresponding html file for some README.md:
# curl -s $1
#
# 2. Discard rows where no substring 'user-content-' (github's markup):
# awk '/user-content-/ { ...
#
# 3.1 Get last number in each row like ' ... </span></a>sitemap.js</h1'.
# It's a level of the current header:
# substr($0, length($0), 1)
#
# 3.2 Get level from 3.1 and insert corresponding number of spaces before '*':
# sprintf("%*s", substr($0, length($0), 1)*3, " ")
#
# 4. Find head's text and insert it inside "* [ ... ]":
# substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
#
# 5. Find anchor and insert it inside "(...)":
# substr($0, match($0, "href=\"[^\"]+?\" ")+6, RLENGTH-8)
#
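# Worked example of the pipeline above on one (illustrative) rendered header line:
#   in : <h1><a id="user-content-installation" class="anchor" href="#installation">...</a>Installation</h1
#   3.1: trailing '1'                      -> header level 1 (indent 1*3 spaces)
#   4  : text between 'a>' and '</h'       -> Installation
#   5  : href value                        -> #installation
#   out: "   * [Installation](#installation)"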
gh_toc_version="0.6.0"
gh_user_agent="gh-md-toc v$gh_toc_version"
#
# Download the html-rendered README.md from its url.
#
#
gh_toc_load() {
local gh_url=$1
if type curl &>/dev/null; then
curl --user-agent "$gh_user_agent" -s "$gh_url"
elif type wget &>/dev/null; then
wget --user-agent="$gh_user_agent" -qO- "$gh_url"
else
echo "Please, install 'curl' or 'wget' and try again."
exit 1
fi
}
#
# Converts a local markdown file into html via the GitHub API
#
# ➥ curl -X POST --data '{"text": "Hello world github/linguist#1 **cool**, and #1!"}' https://api.github.com/markdown
# <p>Hello world github/linguist#1 <strong>cool</strong>, and #1!</p>'"
gh_toc_md2html() {
local gh_file_md=$1
URL=https://api.github.com/markdown/raw
if [ -z "$GH_TOC_TOKEN" ]; then
TOKEN=$GH_TOC_TOKEN
else
TOKEN="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt"
fi
if [ -f "$TOKEN" ]; then
URL="$URL?access_token=$(cat $TOKEN)"
fi
# echo $URL 1>&2
OUTPUT="$(curl -s --user-agent "$gh_user_agent" \
--data-binary @"$gh_file_md" -H "Content-Type:text/plain" \
$URL)"
if [ "$?" != "0" ]; then
echo "XXNetworkErrorXX"
fi
if [ "$(echo "${OUTPUT}" | awk '/API rate limit exceeded/')" != "" ]; then
echo "XXRateLimitXX"
else
echo "${OUTPUT}"
fi
}
#
# Echoes "yes" if the passed string is a url, "no" otherwise
#
gh_is_url() {
case $1 in
https* | http*)
echo "yes";;
*)
echo "no";;
esac
}
#
# TOC generator
#
gh_toc(){
local gh_src=$1
local gh_src_copy=$1
local gh_ttl_docs=$2
local need_replace=$3
if [ "$gh_src" = "" ]; then
echo "Please, enter URL or local path for a README.md"
exit 1
fi
# Show "TOC" string only if working with one document
if [ "$gh_ttl_docs" = "1" ]; then
echo "Table of Contents"
echo "================="
echo ""
gh_src_copy=""
fi
if [ "$(gh_is_url "$gh_src")" == "yes" ]; then
gh_toc_load "$gh_src" | gh_toc_grab "$gh_src_copy"
if [ "${PIPESTATUS[0]}" != "0" ]; then
echo "Could not load remote document."
echo "Please check your url or network connectivity"
exit 1
fi
if [ "$need_replace" = "yes" ]; then
echo
echo "!! '$gh_src' is not a local file"
echo "!! Can't insert the TOC into it."
echo
fi
else
local rawhtml=$(gh_toc_md2html "$gh_src")
if [ "$rawhtml" == "XXNetworkErrorXX" ]; then
echo "Parsing local markdown file requires access to github API"
echo "Please make sure curl is installed and check your network connectivity"
exit 1
fi
if [ "$rawhtml" == "XXRateLimitXX" ]; then
echo "Parsing local markdown file requires access to github API"
echo "Error: You exceeded the hourly limit. See: https://developer.github.com/v3/#rate-limiting"
TOKEN="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/token.txt"
echo "or place github auth token here: $TOKEN"
exit 1
fi
local toc=`echo "$rawhtml" | gh_toc_grab "$gh_src_copy"`
echo "$toc"
if [ "$need_replace" = "yes" ]; then
local ts="<\!--ts-->"
local te="<\!--te-->"
local dt=`date +'%F_%H%M%S'`
local ext=".orig.${dt}"
local toc_path="${gh_src}.toc.${dt}"
local toc_footer="<!-- Added by: `whoami`, at: `date --iso-8601='minutes'` -->"
# http://fahdshariff.blogspot.ru/2012/12/sed-mutli-line-replacement-between-two.html
# clear old TOC
sed -i${ext} "/${ts}/,/${te}/{//!d;}" "$gh_src"
# create toc file
echo "${toc}" > "${toc_path}"
echo -e "\n${toc_footer}\n" >> "$toc_path"
# insert toc file
if [[ "`uname`" == "Darwin" ]]; then
sed -i "" "/${ts}/r ${toc_path}" "$gh_src"
else
sed -i "/${ts}/r ${toc_path}" "$gh_src"
fi
echo
echo "!! TOC was added into: '$gh_src'"
echo "!! Origin version of the file: '${gh_src}${ext}'"
echo "!! TOC added into a separate file: '${toc_path}'"
echo
fi
fi
}
#
# Grabber of the TOC from rendered html
#
# $1 - the source url of the document.
# It is needed when the TOC is generated for multiple documents.
#
gh_toc_grab() {
# if closed <h[1-6]> is on the new line, then move it on the prev line
# for example:
# was: The command <code>foo1</code>
# </h1>
# became: The command <code>foo1</code></h1>
sed -e ':a' -e 'N' -e '$!ba' -e 's/\n<\/h/<\/h/g' |
# find strings that corresponds to template
grep -E -o '<a.*id="user-content-[^"]*".*</h[1-6]' |
# remove code tags
sed 's/<code>//g' | sed 's/<\/code>//g' |
# now all rows are like:
# <a id="user-content-..." href="..."><span ...></span></a> ... </h1
# format result line
# * $0 — whole string
# * last element of each row: "</hN" where N in (1,2,3,...)
echo -e "$(awk -v "gh_url=$1" '{
level = substr($0, length($0), 1)
text = substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
href = substr($0, match($0, "href=\"[^\"]+?\"")+6, RLENGTH-7)
print sprintf("%*s", level*3, " ") "* [" text "](" gh_url href ")" }' |
sed 'y/+/ /; s/%/\\x/g')"
}
#
# Returns filename only from full path or url
#
gh_toc_get_filename() {
echo "${1##*/}"
}
#
# Option handlers
#
gh_toc_app() {
local app_name=$(basename $0)
local need_replace="no"
if [ "$1" = '--help' ] || [ $# -eq 0 ] ; then
echo "GitHub TOC generator ($app_name): $gh_toc_version"
echo ""
echo "Usage:"
echo " $app_name [--insert] src [src] Create TOC for a README file (url or local path)"
echo " $app_name - Create TOC for markdown from STDIN"
echo " $app_name --help Show help"
echo " $app_name --version Show version"
return
fi
if [ "$1" = '--version' ]; then
echo "$gh_toc_version"
echo
echo "os: `lsb_release -d | cut -f 2`"
echo "kernel: `cat /proc/version`"
echo "shell: `$SHELL --version`"
echo
for tool in curl wget grep awk sed; do
printf "%-5s: " $tool
echo `$tool --version | head -n 1`
done
return
fi
if [ "$1" = "-" ]; then
if [ -z "$TMPDIR" ]; then
TMPDIR="/tmp"
elif [ -n "$TMPDIR" -a ! -d "$TMPDIR" ]; then
mkdir -p "$TMPDIR"
fi
local gh_tmp_md
gh_tmp_md=$(mktemp $TMPDIR/tmp.XXXXXX)
while read input; do
echo "$input" >> "$gh_tmp_md"
done
gh_toc_md2html "$gh_tmp_md" | gh_toc_grab ""
return
fi
if [ "$1" = '--insert' ]; then
need_replace="yes"
shift
fi
for md in "$@"
do
echo ""
gh_toc "$md" "$#" "$need_replace"
done
echo ""
echo "Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)"
}
#
# Entry point
#
gh_toc_app "$@"
#!/usr/bin/python
#
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plot graph showing process metric values over time"""
from __future__ import print_function
import argparse
import sys
import json
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as backend_pdf
import matplotlib.ticker as tick
colors=['b', 'r', 'g', 'c', 'pink']
def visualize(file_path):
entries = []
with open(file_path) as f:
entries = [json.loads(line) for line in f.readlines() if line.strip()]
if not entries:
print('There is no data in file {}'.format(file_path))
return
pdf = backend_pdf.PdfPages("process_info.pdf")
idx = 0
names = [name for name in entries[0].keys() if name != 'time']
times = [entry['time'] for entry in entries]
for name in names:
values = [entry[name] for entry in entries]
fig = plt.figure()
ax = plt.gca()
ax.yaxis.set_major_formatter(tick.ScalarFormatter(useMathText=True))
plt.ticklabel_format(style='sci', axis='y', scilimits=(-2,3))
plt.plot(times, values, colors[idx % len(colors)], marker='x', label=name)
plt.xlabel('Time (sec)')
plt.ylabel(name)
plt.ylim(ymin=0)
plt.legend(loc = 'upper left')
pdf.savefig(fig)
idx += 1
plt.show()
pdf.close()
print('Generated process_info.pdf from {}'.format(file_path))
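# Expected input: one JSON object per line, each with a 'time' field (seconds) plus
# one or more numeric metric fields; every non-'time' field gets its own plot page.
# A hypothetical two-line example (metric names are illustrative):
#   {"time": 0.0, "rss_gb": 1.2, "cpu_percent": 35.0}
#   {"time": 5.0, "rss_gb": 1.4, "cpu_percent": 80.0}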
if __name__ == '__main__':
parser = argparse.ArgumentParser(usage='plot_process_info.py <path_to_file>' )
parser.add_argument('file_path', type=str)
flags = parser.parse_args(sys.argv[1:])
visualize(flags.file_path)
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
APP=" python3 ./scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_format=NCHW --batch_size=128 --model=resnet50 --optimizer=momentum --variable_update=horovod --print_training_accuracy=true --eval_during_training_every_n_epochs=1 --nodistortions --num_gpus=1 --num_epochs=90 --weight_decay=1e-4 --data_dir=$data_dir_path --use_fp16=False --data_name=imagenet --train_dir=$save_checkpoint_path
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
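# Hedged launch sketch (script name, paths and process count are illustrative):
#   export data_dir_path=/path/to/imagenet save_checkpoint_path=/path/to/ckpt
#   mpirun -np 4 -x data_dir_path -x save_checkpoint_path ./run_resnet50_horovod.sh
# Open MPI sets OMPI_COMM_WORLD_LOCAL_RANK for each local process, and the case
# statement above uses it to pick the UCX NIC and the numactl CPU/memory binding.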
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks.
tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is
useful for benchmarking the performance of just this gradient aggregation,
instead of the entire model. All the flags that tf_cnn_benchmarks accepts are
also accepted by this script, although many are silently ignored.
The number and shapes of the tensors all-reduced are those of the variables of
the model specified by the --model flag.
TODO(reedwm): Allow custom sizes to be specified.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import app
from absl import flags as absl_flags
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import control_flow_ops
import benchmark_cnn
import cnn_util
import flags
from cnn_util import log_fn
absl_flags.DEFINE_integer('iters_per_step', 5,
'Number of iterations to run all-reduce for, per '
'step. Every step, a session will be run on a Graph '
'that contains this many copies of the all-reduce. '
'The copies are run sequentially. Setting this above '
'1 is useful to lower the overhead of starting the '
'session run, running the VariableV2 ops at the '
'start of the step, etc.')
flags.define_flags()
for name in flags.param_specs.keys():
absl_flags.declare_key_flag(name)
def get_var_shapes(model):
"""Returns the list of variable shapes for a tf_cnn_benchmarks Model."""
with tf.Graph().as_default():
# The variable shapes do not depend on the batch size.
images = tf.placeholder(tf.float32, model.get_input_shapes('train')[0])
model.build_network([images])
return [[int(d) for d in v.shape.dims] for v in tf.trainable_variables()]
def all_reduce(all_device_tensors, variable_mgr):
"""Performs a single batch all-reduce.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
variable_mgr: The VariableMgr to perform the all-reduce.
Returns:
List of list of tensors in the same form as `all_device_tensors`, except the
tensors are aggregated across towers.
"""
tower_grads = [[(g, None) for g in device_tensors] for
device_tensors in all_device_tensors]
_, aggregated_tower_grads = variable_mgr.preprocess_device_grads(tower_grads)
return [
[g for g, _ in agg_device_tensors]
for agg_device_tensors in aggregated_tower_grads]
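# Conceptual sketch: with two towers holding tensors [a0, a1] and [b0, b1], the input
# is [[a0, a1], [b0, b1]] and the returned structure is [[r0, r1], [r0', r1']], where
# ri and ri' both hold the aggregation of ai and bi (e.g. ai + bi for a sum all-reduce),
# with one copy placed on each tower.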
def build_all_reduce_iterations(all_device_tensors, tower_devices, variable_mgr,
num_iters):
"""Builds the all-reduce ops for multiple iterations to aggregate tensors.
The tensors in `all_device_tensors` are aggregated `num_iters` times. Each
iteration aggregates the results from the previous iteration. The iterations
are run sequentially, so the aggregations for an iteration do not start
running until the previous iteration has completed. Each iteration after the
first is aggregating already-aggregated values, but it does not matter because
we are only aggregating for benchmarking purposes.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
tower_devices: A list of device strings. tower_devices[t] is the device
of the tensors in all_device_tensors[t].
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that when run, causes the all-reduce ops to run.
"""
for i in range(num_iters):
with tf.name_scope('iteration_%d' % i):
# Step 1: Do the aggregation.
with tf.name_scope('tensor_aggregation'):
all_device_tensors = all_reduce(all_device_tensors, variable_mgr)
# Step 2. Create identity ops, to bring the aggregated results back to
# each device.
new_all_device_tensors = []
for device, device_tensors in zip(tower_devices, all_device_tensors):
with tf.device(device):
new_all_device_tensors.append([
tf.identity(t, name='identity_after_allreduce')
for t in device_tensors
])
all_device_tensors = new_all_device_tensors
# Step 3. Add control dependencies to delay the next iteration until this
# iteration is complete. To avoid extra overhead, we do not have any
# cross-device control dependencies, which means it's possible for two
# iterations to slightly overlap.
new_all_device_tensors = []
for device_tensors in all_device_tensors:
new_all_device_tensors.append([
control_flow_ops.with_dependencies(
device_tensors, t, name='identity_after_dependencies')
for t in device_tensors
])
all_device_tensors = new_all_device_tensors
# To prevent the dependency optimizer from removing every op we created,
# we store the results in variables.
ops_to_run = []
for device, device_tensors in zip(tower_devices, all_device_tensors):
with tf.device(device):
for t in device_tensors:
# The placeholder initial value is never run.
var = tf.Variable(tf.placeholder(tf.float32, t.shape), collections=[])
ops_to_run.append(var.assign(t))
return tf.group(*ops_to_run)
def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters):
"""Builds the graph for the benchmark.
Args:
tower_devices: A list of device strings of the devices to run the all-reduce
benchmark on.
tensor_shapes: A list of shapes of the tensors that will be aggregated for
the all-reduce.
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that runs the benchmark.
"""
all_device_tensors = []
for i, tower_device in enumerate(tower_devices):
with tf.device(tower_device):
device_tensors = []
for j, shape in enumerate(tensor_shapes):
tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32),
name='tensor_%d_on_device_%d' % (j, i))
device_tensors.append(tensor)
all_device_tensors.append(device_tensors)
log_fn('Building all-reduce ops')
benchmark_op = build_all_reduce_iterations(all_device_tensors, tower_devices,
variable_mgr, num_iters)
log_fn('Done building all-reduce ops')
return benchmark_op
def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op):
"""Runs the graph for the benchmark.
Args:
benchmark_op: An op that runs the benchmark.
bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
init_ops: A list of ops that are run before `benchmark_op` for
initialization.
dummy_loss_op: Any op. We must pass a loss op to
`benchmark_cnn.benchmark_one_step`, but the result of the op is never
actually used.
"""
config = benchmark_cnn.create_config_proto(bench_cnn.params)
with tf.Session(config=config) as sess:
for op in init_ops:
sess.run(op)
step_train_times = []
fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op}
log_fn('Running warmup')
for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches):
if i == 0:
log_fn('Running all-reduce ops')
start = time.perf_counter()
if i > 0 and i % bench_cnn.params.display_every == 0:
log_fn('Iteration: %d. Average time per step so far: %s' %
(i, (time.perf_counter() - start) / i))
# Call benchmark_one_step instead of directly calling sess.run(...), to
# potentially get a trace file, partitioned graphs, etc.
benchmark_cnn.benchmark_one_step(
sess=sess,
fetches=fetches,
step=i,
# The batch size is only used for the images/sec calculation, which is
# not actually calculated because we pass show_images_per_sec=False.
batch_size=None,
step_train_times=step_train_times,
trace_filename=bench_cnn.trace_filename,
partitioned_graph_file_prefix=(
bench_cnn.params.partitioned_graph_file_prefix),
profiler=None,
image_producer=None,
params=bench_cnn.params,
show_images_per_sec=False)
log_fn('Average time per step: %s' %
((time.perf_counter() - start) / bench_cnn.num_batches))
def run_benchmark(bench_cnn, num_iters):
"""Runs the all-reduce benchmark.
Args:
bench_cnn: The BenchmarkCNN where params, the variable manager, and other
attributes are obtained.
num_iters: Number of iterations to do all-reduce for.
Raises:
ValueError: Invalid params of bench_cnn.
"""
if bench_cnn.params.variable_update != 'replicated':
raise ValueError('--variable_update=replicated must be specified to use '
'the all-reduce benchmark')
if bench_cnn.params.variable_consistency == 'relaxed':
raise ValueError('--variable_consistency=relaxed is not supported')
benchmark_op = build_graph(bench_cnn.raw_devices,
get_var_shapes(bench_cnn.model),
bench_cnn.variable_mgr, num_iters)
init_ops = [
tf.global_variables_initializer(),
bench_cnn.variable_mgr.get_post_init_ops()
]
loss_op = tf.no_op()
if bench_cnn.graph_file:
path, filename = os.path.split(bench_cnn.graph_file)
as_text = filename.endswith('txt')
log_fn('Writing GraphDef as %s to %s' % (
'text' if as_text else 'binary', bench_cnn.graph_file))
tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
path, filename, as_text)
run_graph(benchmark_op, bench_cnn, init_ops, loss_op)
# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks
def main(positional_arguments):
# Command-line arguments like '--distortions False' are equivalent to
# '--distortions=True False', where False is a positional argument. To prevent
# this from silently running with distortions, we do not allow positional
# arguments.
assert len(positional_arguments) >= 1
if len(positional_arguments) > 1:
raise ValueError('Received unknown positional arguments: %s'
% positional_arguments[1:])
params = benchmark_cnn.make_params_from_flags()
params = benchmark_cnn.setup(params)
bench = benchmark_cnn.BenchmarkCNN(params)
tfversion = cnn_util.tensorflow_version_tuple()
log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
run_benchmark(bench, absl_flags.FLAGS.iters_per_step)
if __name__ == '__main__':
tf.disable_v2_behavior()
app.run(main) # Raises error on invalid flags, unlike tf.app.run()
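# Example invocation (flag values are illustrative; --variable_update=replicated is
# required by run_benchmark above):
#   python all_reduce_benchmark.py --variable_update=replicated --model=resnet50 \
#     --num_gpus=4 --all_reduce_spec=nccl --iters_per_step=5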
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for allreduce."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections as pycoll
import re
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
try:
from tensorflow.python.distribute.v1 import all_reduce
except ImportError:
# Compatibility with TF 2.4 and below
from tensorflow.python.distribute import all_reduce
from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import ops
from tensorflow.python.ops import collective_ops
AllReduceSpecTuple = pycoll.namedtuple('AllReduceSpecTuple', 'alg shards limit')
def parse_general_int(s):
"""Parse integer with power-of-2 suffix eg. 32k."""
mo = re.match(r'(\d+)([KkMGT]?)$', s)
if mo:
i, suffix = mo.group(1, 2)
v = int(i)
if suffix:
if suffix == 'K' or suffix == 'k':
v *= 1024
elif suffix == 'M':
v *= (1024 * 1024)
elif suffix == 'G':
v *= (1024 * 1024 * 1024)
elif suffix == 'T':
v *= (1024 * 1024 * 1024 * 1024)
else:
raise ValueError('invalid integer string %s' % s)
return v
else:
v = int(s)
return v
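# Worked examples, following directly from the regex and multipliers above:
#   parse_general_int('32k') == 32 * 1024 == 32768
#   parse_general_int('1M')  == 1024 * 1024 == 1048576
#   parse_general_int('-1')  == -1   # no regex match, falls back to int()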
def parse_all_reduce_spec(all_reduce_spec):
"""Parse all_reduce_spec.
Args:
all_reduce_spec: a string specifying a combination of all-reduce
algorithms to apply for gradient reduction.
Returns:
a list of AllReduceSpecTuple.
Raises:
ValueError: all_reduce_spec is not well-formed.
An all_reduce_spec has BNF form:
int ::= positive whole number
g_int ::= int[KkMGT]?
alg_spec ::= alg | alg#int
range_spec ::= alg_spec | alg_spec/alg_spec
spec ::= range_spec | range_spec:g_int:range_spec
Not all syntactically correct specifications are supported.
Examples of supported all_reduce_spec strings, with semantics explained:
'collective' == apply tf.collective_reduce operator to all tensors.
'collective#2' == apply tf.collective_reduce operator to all tensors,
requesting up to 2 simultaneous transfers at each node, if
feasible, by subdividing tensor by an additional factor of 2.
'xring' == apply ring all-reduce to all tensors
'xring#2' == apply ring all-reduce to all tensors, using two simultaneous
transfer rings, each operating on 1/2 of each tensor.
'nccl' == apply NCCL all-reduce to all tensors (only works within
a single worker process where all devices are GPUs)
'nccl/xring' == apply NCCL all-reduce to all tensors within each worker
to produce at least one full-reduced (locally) value,
then apply ring all-reduce to one such value from each
worker, then apply NCCL broadcast to propagate those globally
reduced values back to every device within each worker.
'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each
distributed tensor is reduced by copying all instances to
one of the worker CPUs, computing the reduction there, then
copying back to each participating device. Tensor reductions
are assigned to specific CPUs round-robin.
'psgpu#4' == Arrange all GPUs across all workers into groups of 4.
Each distributed tensor is shuffle reduced against one
such group of 4 GPUs, selected round-robin. That is, each
tensor is split across 4 shards for the reduction.
'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to
tensors of size <= 2048 elements, apply 2-shard pscpu to
tensors up to size 64k elements, apply xring to larger tensors.
'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on
the worker's CPU, then use 2-shard shuffle to reduce those
locally reduced tensors across workers (on the worker CPUs), then
scatter the globally reduced values locally from each worker CPU.
"""
range_parts = all_reduce_spec.split(':') + ['-1']
if len(range_parts) % 2:
raise ValueError('all_reduce_spec not well formed: %s' % all_reduce_spec)
limit = 0
spec = []
alg = None
shards = 1
for i, range_part in enumerate(range_parts):
if i % 2 == 1:
try:
limit = parse_general_int(range_part)
spec.append(AllReduceSpecTuple(alg=alg, shards=shards, limit=limit))
except ValueError:
raise ValueError('all_reduce_spec (%s) contains non-integer range %s' %
(all_reduce_spec, range_part))
else:
alg = range_part
alg_parts = range_part.split('#')
alg = alg_parts[0]
if len(alg_parts) > 1:
try:
shards = int(alg_parts[1])
except ValueError:
raise ValueError('all_reduce_spec (%s) contains non-integer '
'shards %s' % (all_reduce_spec, alg_parts[1]))
else:
shards = 1
if alg not in [
'nccl', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu', 'xring', 'pscpu',
'psgpu', 'pscpu/pscpu', 'collective'
]:
raise ValueError('all_reduce_spec (%s) contains invalid alg %s' %
(all_reduce_spec, alg))
return spec
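# Worked example of the grammar above (values follow directly from the parsing code):
#   parse_all_reduce_spec('pscpu:32k:xring') ==
#     [AllReduceSpecTuple(alg='pscpu', shards=1, limit=32768),
#      AllReduceSpecTuple(alg='xring', shards=1, limit=-1)]
# i.e. single-shard pscpu for tensors of up to 32k elements, xring for everything larger.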
def build_all_reduce_device_prefixes(job_name, num_tasks):
"""Build list of device prefix names for all_reduce.
Args:
job_name: 'worker', 'ps' or 'localhost'.
num_tasks: number of jobs across which device names should be generated.
Returns:
A list of device name prefix strings. Each element spells out the full
host name without adding the device.
e.g. '/job:worker/task:0'
"""
if job_name != 'localhost':
return ['/job:%s/task:%d' % (job_name, d) for d in range(0, num_tasks)]
else:
assert num_tasks == 1
return ['/job:%s' % job_name]
def group_device_names(devices, group_size):
"""Group device names into groups of group_size.
Args:
devices: list of strings naming devices.
group_size: int >= 1
Returns:
list of lists of devices, where each inner list is group_size long,
and each device appears at least once in an inner list. If
len(devices) % group_size = 0 then each device will appear
exactly once.
Raises:
ValueError: group_size > len(devices)
"""
num_devices = len(devices)
if group_size > num_devices:
raise ValueError('only %d devices, but group_size=%d' % (num_devices,
group_size))
num_groups = (
num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
groups = [[] for i in range(num_groups)]
for i in range(0, num_groups * group_size):
groups[i % num_groups].append(devices[i % num_devices])
return groups
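# Worked example (device names are illustrative): with 3 devices and group_size=2
# there are 2 groups and the assignment wraps around, so one device repeats:
#   group_device_names(['/gpu:0', '/gpu:1', '/gpu:2'], 2)
#     == [['/gpu:0', '/gpu:2'], ['/gpu:1', '/gpu:0']]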
def split_grads_by_size(threshold_size, device_grads):
"""Break gradients into two sets according to tensor size.
Args:
threshold_size: int size cutoff for small vs large tensor.
device_grads: List of lists of (gradient, variable) tuples. The outer
list is over devices. The inner list is over individual gradients.
Returns:
small_grads: Subset of device_grads where shape is <= threshold_size
elements.
large_grads: Subset of device_grads where shape is > threshold_size
elements.
"""
small_grads = []
large_grads = []
for dl in device_grads:
small_dl = []
large_dl = []
for (g, v) in dl:
tensor_size = g.get_shape().num_elements()
if tensor_size <= threshold_size:
small_dl.append([g, v])
else:
large_dl.append([g, v])
if small_dl:
small_grads.append(small_dl)
if large_dl:
large_grads.append(large_dl)
return small_grads, large_grads
_instance_key = 1
def new_collective_instance_key():
"""Returns a new instance key for use in defining a collective op."""
global _instance_key
v = _instance_key
_instance_key += 1
return v
_group_key = 1
_group_key_table = dict()
def collective_group_key(devices):
"""Returns a group key for the set of devices.
Args:
devices: list of strings naming devices in a collective group.
Returns:
int key uniquely identifying the set of device names.
"""
global _group_key
global _group_key_table
parsed = [pydev.DeviceSpec.from_string(d) for d in devices]
names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed])
concat = ','.join(names)
if concat not in _group_key_table.keys():
new_key = _group_key
_group_key += 1
_group_key_table[concat] = new_key
rv = _group_key_table[concat]
return rv
def build_collective_reduce(input_tensors, num_workers, num_shards,
red_op='Add', un_op='Id'):
"""Build a subgraph that does one full all-reduce, using the collective Op.
Args:
input_tensors: tensors within a single worker graph that are to be reduced
together; must be one per device.
num_workers: total number of workers with identical independent graphs that
will be doing this same reduction. The reduction will actually include
the corresponding tensors at all these workers.
num_shards: number of shards into which to divide each per-tick chunk,
normally 1 but could be higher on multi-data-path architectures.
red_op: string naming the reduction op
un_op: string naming the unary final op
Returns:
An array of final tensors, one per device, computed by the full reduction.
Raises:
ValueError: There must be at least two tensors over all the workers.
"""
group_size = len(input_tensors) * num_workers
if group_size < 2:
raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
devices = [t.device for t in input_tensors]
num_devices = len(devices)
group_key = collective_group_key(devices)
instance_key = new_collective_instance_key()
out_tensors = []
if num_shards == 1:
subdiv_offsets = [0]
elif num_shards == 2:
if num_devices > 1:
subdiv_offsets = [0, -(num_devices // 2)]
else:
subdiv_offsets = [0]
else:
raise ValueError('Unsupported num_shards %d' % num_shards)
for d in range(num_devices):
with ops.device(devices[d]):
reduce_op = collective_ops.all_reduce(input_tensors[d],
group_size, group_key, instance_key,
red_op, un_op,
subdiv_offsets)
out_tensors.append(reduce_op)
return out_tensors
def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
return collective_ops.broadcast_send(t, shape, dtype, group_size, group_key,
instance_key)
def broadcast_recv(shape, dtype, group_size, group_key, instance_key):
return collective_ops.broadcast_recv(shape, dtype, group_size, group_key,
instance_key)
def sum_grad_and_var_all_reduce(single_session,
grad_and_vars,
num_workers,
alg,
gpu_indices,
aux_devices=None,
num_shards=1):
"""Apply all-reduce algorithm over specified gradient tensors."""
scaled_grads = [g for g, _ in grad_and_vars]
if alg == 'collective':
assert not single_session
summed_grads = build_collective_reduce(
scaled_grads, num_workers, num_shards, 'Add', 'Id')
else:
with tf.name_scope('allreduce'):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
if alg == 'nccl':
summed_grads = all_reduce.build_nccl_all_reduce(scaled_grads, tf.add)
elif alg == 'xring':
summed_grads = all_reduce.build_ring_all_reduce(
scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
elif alg == 'nccl/xring':
summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
tf.add)
elif alg == 'nccl/rechd':
summed_grads = all_reduce.build_nccl_then_recursive_hd(
scaled_grads, tf.add)
elif alg == 'nccl/pscpu':
summed_grads = all_reduce.build_nccl_then_shuffle(
scaled_grads, aux_devices, tf.add, tf.add_n)
elif alg == 'pscpu/pscpu':
summed_grads = all_reduce.build_shuffle_then_shuffle(
scaled_grads,
aux_devices,
# TODO(tucker): devise a way of better specifying the device set
# for the second level.
[aux_devices[0]],
tf.add_n)
elif alg in ['pscpu', 'psgpu']:
summed_grads = all_reduce.build_shuffle_all_reduce(
scaled_grads, aux_devices, tf.add_n)
else:
raise ValueError('unsupported all_reduce alg: ', alg)
result = []
for (_, v), g in zip(grad_and_vars, summed_grads):
result.append([g, v])
return result
def contains_any(haystack, needles):
"""Tests if any needle is a substring of haystack.
Args:
haystack: a string
needles: list of strings
Returns:
True if any element of needles is a substring of haystack,
False otherwise.
"""
for n in needles:
if n in haystack:
return True
return False
def sum_gradients_all_reduce(single_session,
dev_prefixes,
tower_grads,
num_workers,
alg,
num_shards,
gpu_indices,
agg_small_grads_max_bytes=0,
agg_small_grads_max_group=10,
allreduce_merge_scope=1):
"""Apply all-reduce algorithm over specified gradient tensors.
Args:
single_session: true if reduction is applied to one graph across
all workers, false if the application is to a single-worker graph only.
dev_prefixes: list of prefix strings to use to generate PS device names.
tower_grads: the gradients to reduce.
num_workers: number of worker processes across entire job.
alg: the all-reduce algorithm to apply.
num_shards: alg-specific sharding factor.
gpu_indices: indices of local GPUs in order usable for ring-reduce.
agg_small_grads_max_bytes: largest tensor eligible for aggregation,
in number of bytes.
agg_small_grads_max_group: largest permitted aggregation of small
tensors.
allreduce_merge_scope: size of groups into which to partition consecutive
gradients grouped under a common 'allreduce' name scope for application
of ScopedAllocator optimization.
Returns:
list of reduced tensors
"""
alg_contains_shuffle = contains_any(alg, ['pscpu', 'psgpu'])
is_hierarchical = '/' in alg
if 'pscpu' in alg:
aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
elif 'psgpu' in alg:
aux_devices = [
prefix + '/gpu:%d' % i
for i in range(len(gpu_indices))
for prefix in dev_prefixes
]
else:
aux_devices = ['/job:localhost/cpu:0']
aux_device_groups = group_device_names(
aux_devices,
num_shards if (alg != 'collective' and alg_contains_shuffle) else 1)
group_index = 0
if agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
tower_grads, packing = pack_small_tensors(
tower_grads,
max_bytes=agg_small_grads_max_bytes,
max_group=agg_small_grads_max_group)
else:
packing = None
reduced_gv_list = []
gv = list(zip(*tower_grads))
merge_scope = allreduce_merge_scope if allreduce_merge_scope > 0 else 1
chunked_gv = [gv[x:x + merge_scope]
for x in xrange(0, len(gv), merge_scope)]
for chunk in chunked_gv:
with tf.name_scope('allreduce'):
for grad_and_vars in chunk:
reduced_gv_list.append(sum_grad_and_var_all_reduce(
single_session,
grad_and_vars, num_workers, alg, gpu_indices,
(aux_devices if is_hierarchical
else aux_device_groups[group_index]),
num_shards))
group_index = (group_index + 1) % len(aux_device_groups)
new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
if packing:
new_tower_grads = unpack_small_tensors(new_tower_grads, packing)
return new_tower_grads
def extract_ranges(index_list, range_size_limit=32):
"""Extract consecutive ranges and singles from index_list.
Args:
index_list: List of monotone increasing non-negative integers.
range_size_limit: Largest size range to return. If a larger
consecutive range exists it will be returned as multiple
ranges.
Returns:
ranges, singles where ranges is a list of [first, last] pairs of
consecutive elements in index_list, and singles is all of the
other elements, in original order.
"""
if not index_list:
return [], []
first = index_list[0]
last = first
ranges = []
singles = []
for i in index_list[1:]:
if i == last + 1 and (last - first) <= range_size_limit:
last = i
else:
if last > first:
ranges.append([first, last])
else:
singles.append(first)
first = i
last = i
if last > first:
ranges.append([first, last])
else:
singles.append(first)
return ranges, singles
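# Worked example (matches testExtractRanges in allreduce_test.py):
#   extract_ranges([1, 3, 4, 6, 7, 8, 9]) == ([[3, 4], [6, 9]], [1])
# 1 has no consecutive neighbour, so it is returned as a single.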
GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
def pack_range(key, packing, grad_vars, rng):
"""Form the concatenation of a specified range of gradient tensors.
Args:
key: Value under which to store meta-data in packing that will be used
later to restore the grad_var list structure.
packing: Dict holding data describing packed ranges of small tensors.
grad_vars: List of (grad, var) pairs for one tower.
rng: A pair of integers giving the first, last indices of a consecutive
range of tensors to be packed.
Returns:
A tensor that is the concatenation of all the specified small tensors.
"""
to_pack = grad_vars[rng[0]:rng[1] + 1]
members = []
variables = []
restore_shapes = []
with tf.name_scope('pack'):
for g, v in to_pack:
variables.append(v)
restore_shapes.append(g.shape)
with tf.device(g.device):
members.append(tf.reshape(g, [-1]))
packing[key] = GradPackTuple(
indices=range(rng[0], rng[1] + 1),
vars=variables,
shapes=restore_shapes)
with tf.device(members[0].device):
return tf.concat(members, 0)
def unpack_grad_tuple(gv, gpt):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, maybe following subsequent operations like
reduction.
"""
elt_widths = [x.num_elements() for x in gpt.shapes]
with tf.device(gv[0][0].device):
with tf.name_scope('unpack'):
splits = tf.split(gv[0], elt_widths)
unpacked_gv = []
for idx, s in enumerate(splits):
unpacked_gv.append((tf.reshape(s, gpt.shapes[idx]), gpt.vars[idx]))
return unpacked_gv
def pack_small_tensors(tower_grads, max_bytes=0, max_group=0):
"""Concatenate small gradient tensors together for reduction.
Args:
tower_grads: List of lists of (gradient, variable) tuples.
max_bytes: Int giving max number of bytes in a tensor that
may be considered small.
max_group: Int giving max number of small tensors that may be
concatenated into one new tensor.
Returns:
new_tower_grads, packing where new_tower_grads is identical to
tower_grads except that all feasible small_tensors have been removed
from their places and concatenated into larger tensors that are
now in the front of the list for each tower, and packing contains
the data necessary to restore the tower_grads structure.
Look through the first tower for gradients of the same type (float),
and small size, that are all sequential. For each such group,
replace by a new tensor that is a flattened concatenation. Note
that the corresponding variable will be absent, which doesn't matter
because it isn't used during all-reduce.
Requires:
Every gv_list in towers must have isomorphic structure including identical
tensor sizes and types.
"""
small_indices = []
large_indices = []
for idx, (g, _) in enumerate(tower_grads[0]):
if g.dtype == tf.float32 and (4 * g.shape.num_elements()) <= max_bytes:
small_indices.append(idx)
else:
large_indices.append(idx)
small_ranges, small_singles = extract_ranges(
small_indices, range_size_limit=max_group)
large_indices = sorted(large_indices + small_singles)
num_gv = len(tower_grads[0])
packing = {}
if small_ranges:
new_tower_grads = []
for dev_idx, gv_list in enumerate(tower_grads):
assert len(gv_list) == num_gv
new_gv_list = []
for r in small_ranges:
key = '%d:%d' % (dev_idx, len(new_gv_list))
new_gv_list.append((pack_range(key, packing, gv_list, r),
'packing_var_placeholder'))
for i in large_indices:
new_gv_list.append(gv_list[i])
new_tower_grads.append(new_gv_list)
return new_tower_grads, packing
else:
return tower_grads, None
def unpack_small_tensors(tower_grads, packing):
"""Undo the structure alterations to tower_grads done by pack_small_tensors.
Args:
tower_grads: List of List of (grad, var) tuples.
packing: A dict generated by pack_small_tensors describing the changes
it made to tower_grads.
Returns:
new_tower_grads: identical to tower_grads except that concatenations
of small tensors have been split apart and returned to their original
positions, paired with their original variables.
"""
if not packing:
return tower_grads
new_tower_grads = []
num_devices = len(tower_grads)
num_packed = len(packing.keys()) // num_devices
for dev_idx, gv_list in enumerate(tower_grads):
new_gv_list = gv_list[num_packed:]
for i in xrange(0, num_packed):
k = '%d:%d' % (dev_idx, i)
gpt = packing[k]
gv = unpack_grad_tuple(gv_list[i], gpt)
for gi, idx in enumerate(gpt.indices):
assert idx == gpt.indices[gi]
new_gv_list.insert(idx, gv[gi])
new_tower_grads.append(new_gv_list)
return new_tower_grads
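# Round-trip sketch (exercised by DynamicPackingTest in allreduce_test.py):
#   packed_tg, packing = pack_small_tensors(tower_grads, max_bytes=..., max_group=...)
#   unpack_small_tensors(packed_tg, packing)
# restores the original (gradient shape, variable) structure per tower; when nothing
# qualifies as small, packing is None and both calls leave tower_grads unchanged.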
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_cnn_benchmark.allreduce."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections as pycoll
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import variables
import allreduce
class AllReduceTest(tf.test.TestCase):
def testGroupKey(self):
d0 = ['/job:worker/replica:0/task:0/device:GPU:1',
'/job:worker/replica:0/task:0/device:GPU:0',
'/job:worker/replica:0/task:0/device:GPU:3',]
d1 = ['/job:worker/replica:0/task:1/device:GPU:1',
'/job:worker/replica:0/task:1/device:GPU:0',
'/job:worker/replica:0/task:1/device:GPU:3',]
d2 = ['/job:worker/replica:0/task:1/device:GPU:1',
'/job:worker/replica:0/task:1/device:GPU:3',
'/job:worker/replica:0/task:1/device:GPU:0',]
d3 = ['/job:worker/replica:0/task:1/device:GPU:1',
'/job:worker/replica:0/task:1/device:GPU:3',
'/job:worker/replica:0/task:1/device:GPU:2',]
d4 = ['/job:worker/task:0/device:GPU:1',
'/job:worker/task:0/device:GPU:2',
'/job:worker/task:0/device:GPU:3',]
d5 = ['/job:worker/task:0/device:CPU:1',
'/job:worker/task:0/device:CPU:2']
d6 = ['/job:worker/task:0/device:CPU:2',
'/job:worker/task:0/device:CPU:1']
g0 = allreduce.collective_group_key(d0)
g1 = allreduce.collective_group_key(d1)
g2 = allreduce.collective_group_key(d2)
g3 = allreduce.collective_group_key(d3)
g4 = allreduce.collective_group_key(d4)
g5 = allreduce.collective_group_key(d5)
g6 = allreduce.collective_group_key(d6)
self.assertEqual(g0, g1)
self.assertEqual(g0, g2)
self.assertNotEqual(g0, g3)
self.assertEqual(g3, g4)
self.assertEqual(g5, g6)
self.assertNotEqual(g4, g5)
def testExtractRanges(self):
x = []
expected_ranges = []
expected_singles = []
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 3, 4, 6, 7, 8, 9]
expected_ranges = [[3, 4], [6, 9]]
expected_singles = [1]
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 2, 3, 4, 6, 7, 8, 9]
expected_ranges = [[1, 4], [6, 9]]
expected_singles = []
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 3, 4, 6, 7, 9]
expected_ranges = [[3, 4], [6, 7]]
expected_singles = [1, 9]
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
x = [1, 3, 6, 9]
expected_ranges = []
expected_singles = [1, 3, 6, 9]
ranges, singles = allreduce.extract_ranges(x)
self.assertEqual(expected_ranges, ranges)
self.assertEqual(expected_singles, singles)
def testPackRange(self):
packing = {}
t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32)
t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32)
gv = [(t0, 'v0'), (t1, 'v1')]
new_t = allreduce.pack_range('0:0', packing, gv, [0, 1])
self.assertEqual(1, new_t.shape.ndims)
self.assertEqual(8, new_t.shape.dims[0])
self.assertEqual(
packing, {
'0:0':
allreduce.GradPackTuple(
indices=range(2),
vars=['v0', 'v1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])])
})
t2 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
gv = [(t0, 'v0'), (t1, 'v1'), (t2, 'v2'), (t3, 'v3')]
packing = {}
new_t = allreduce.pack_range('1:0', packing, gv, [0, 3])
self.assertEqual(1, new_t.shape.ndims)
self.assertEqual(26, new_t.shape.dims[0])
self.assertEqual(
packing, {
'1:0':
allreduce.GradPackTuple(
indices=range(4),
vars=['v0', 'v1', 'v2', 'v3'],
shapes=[
tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3]),
tf.TensorShape([3, 3])
])
})
def testUnpackGradTuple(self):
packing = {
'0:0':
allreduce.GradPackTuple(
indices=range(4),
vars=['v0', 'v1', 'v2', 'v3'],
shapes=[
tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3]),
tf.TensorShape([3, 3])
])
}
tc = tf.constant([0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7, 8,
0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32)
packed_gv = [tc, 'packing_var_placeholder']
gv = allreduce.unpack_grad_tuple(packed_gv, packing['0:0'])
self.assertLen(gv, 4)
self.assertEqual('v0', gv[0][1])
self.assertEqual('v1', gv[1][1])
self.assertEqual('v2', gv[2][1])
self.assertEqual('v3', gv[3][1])
self.assertEqual(1, gv[0][0].shape.ndims)
self.assertEqual(4, gv[0][0].shape.dims[0])
self.assertEqual(1, gv[1][0].shape.ndims)
self.assertEqual(4, gv[1][0].shape.dims[0])
self.assertEqual(2, gv[2][0].shape.ndims)
self.assertEqual(3, gv[2][0].shape.dims[0])
self.assertEqual(3, gv[2][0].shape.dims[1])
def testPackSmallTensors(self):
t0 = tf.constant([0, 1, 2, 3], dtype=tf.float32)
t1 = tf.constant([4, 5, 6, 7], dtype=tf.float32)
t2 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
t3 = tf.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=tf.float32)
tower_grads = []
for d in range(0, 3):
gv = [(t0, 'v_%d_0' % d), (t1, 'v_%d_1' %d), (t2, 'v_%d_2' %d),
(t3, 'v_%d_3' % d)]
tower_grads.append(gv)
# 1) Set the size limit so small that nothing gets concatenated.
new_tower_grads, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=12,
max_group=10)
self.assertEqual(tower_grads, new_tower_grads)
self.assertIs(packing, None)
# 2) Set the size limit so only the first two tensors get concatenated
new_tower_grads, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=16, # 16 bytes == 4 elements
max_group=10)
self.assertLen(new_tower_grads, 3)
self.assertLen(tower_grads[0], 4)
first_tower = new_tower_grads[0]
self.assertLen(first_tower, 3)
self.assertEqual(1, first_tower[0][0].shape.ndims)
self.assertEqual(8, first_tower[0][0].shape.dims[0])
self.assertEqual(packing,
{'0:0': allreduce.GradPackTuple(
indices=range(2),
vars=['v_0_0', 'v_0_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'1:0': allreduce.GradPackTuple(
indices=range(2),
vars=['v_1_0', 'v_1_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'2:0': allreduce.GradPackTuple(
indices=range(2),
vars=['v_2_0', 'v_2_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])])})
# 3) Set the size limit so all tensors get concatenated
new_tower_grads, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=256, # bytes = 64 elements
max_group=10)
self.assertLen(new_tower_grads, 3)
self.assertLen(tower_grads[0], 4)
self.assertLen(new_tower_grads[0], 1)
first_tower = new_tower_grads[0]
self.assertEqual(1, first_tower[0][0].shape.ndims)
self.assertEqual(26, first_tower[0][0].shape.dims[0])
self.assertEqual(packing,
{'0:0': allreduce.GradPackTuple(
indices=range(4),
vars=['v_0_0', 'v_0_1', 'v_0_2', 'v_0_3'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])]),
'1:0': allreduce.GradPackTuple(
indices=range(4),
vars=['v_1_0', 'v_1_1', 'v_1_2', 'v_1_3'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])]),
'2:0': allreduce.GradPackTuple(
indices=range(4),
vars=['v_2_0', 'v_2_1', 'v_2_2', 'v_2_3'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4]),
tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])])})
def testUnpackSmallTensors(self):
packing = {'0:0': allreduce.GradPackTuple(indices=range(2),
vars=['v_0_0', 'v_0_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'0:1': allreduce.GradPackTuple(indices=range(3, 5),
vars=['v_0_3', 'v_0_4'],
shapes=[tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])]),
'1:0': allreduce.GradPackTuple(indices=range(2),
vars=['v_1_0', 'v_1_1'],
shapes=[tf.TensorShape([4]),
tf.TensorShape([4])]),
'1:1': allreduce.GradPackTuple(indices=range(3, 5),
vars=['v_1_3', 'v_1_4'],
shapes=[tf.TensorShape([3, 3,]),
tf.TensorShape([3, 3,])])}
t0 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7], dtype=tf.float32)
t1 = tf.constant([17, 17], dtype=tf.float32)
t2 = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, 8,
0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=tf.float32)
t3 = tf.constant([0], dtype=tf.float32)
tower_grads = []
for d in range(0, 2):
one_tower = [(t0, 'packing_var_placeholder'),
(t2, 'packing_var_placeholder'),
(t1, 'v_%d_2' % d), (t3, 'v_%d_5' % d)]
tower_grads.append(one_tower)
new_tower_grads = allreduce.unpack_small_tensors(tower_grads, packing)
self.assertLen(new_tower_grads, 2)
for d, tg in enumerate(new_tower_grads):
self.assertLen(tg, 6)
self.assertEqual('v_%d_0' % d, tg[0][1])
self.assertEqual('v_%d_1' % d, tg[1][1])
self.assertEqual('v_%d_2' % d, tg[2][1])
self.assertEqual('v_%d_3' % d, tg[3][1])
self.assertEqual('v_%d_4' % d, tg[4][1])
self.assertEqual('v_%d_5' % d, tg[5][1])
self.assertEqual(1, tg[0][0].shape.ndims)
self.assertEqual(4, tg[0][0].shape.dims[0])
self.assertEqual(1, tg[1][0].shape.ndims)
self.assertEqual(4, tg[1][0].shape.dims[0])
self.assertEqual(1, tg[2][0].shape.ndims)
self.assertEqual(2, tg[2][0].shape.dims[0])
self.assertEqual(2, tg[3][0].shape.ndims)
self.assertEqual(3, tg[3][0].shape.dims[0])
self.assertEqual(3, tg[3][0].shape.dims[1])
self.assertEqual(2, tg[4][0].shape.ndims)
self.assertEqual(3, tg[4][0].shape.dims[0])
self.assertEqual(3, tg[4][0].shape.dims[1])
self.assertEqual(1, tg[5][0].shape.ndims)
self.assertEqual(1, tg[5][0].shape.dims[0])
class DynamicPackingTest(test_util.TensorFlowTestCase):
"""Packing/Unpacking tests that require executing a TensorFlow session."""
def _init_tensors(self, num_towers, tensor_shapes):
"""Construct a collection of tensors across multiple devices."""
num_tensors = len(tensor_shapes)
consts = []
tensors = []
vrbls = []
tower_grads = []
tf.Variable([-1], dtype=tf.int32, name='packing_var_placeholder')
for dev_idx in range(0, num_towers):
devname = '/job:localhost/device:GPU:%d' % dev_idx
consts.append([])
tensors.append([])
vrbls.append([])
with tf.device(devname):
base_value = 0
gv_tuples = []
for t_idx in range(0, num_tensors):
shape = tensor_shapes[t_idx]
num_elts = 0
for d in shape:
num_elts = (num_elts or 1) * d
c = np.fromiter(range(base_value, base_value + num_elts),
dtype=np.float32).reshape(shape)
base_value += num_elts
consts[dev_idx].append(c)
tensors[dev_idx].append(tf.constant(c))
vrbls[dev_idx].append(
tf.Variable(c, name='v_d%d_t%d' % (dev_idx, t_idx)))
gv_tuples.append((tensors[dev_idx][-1], vrbls[dev_idx][-1]))
tower_grads.append(gv_tuples)
return tower_grads, consts, tensors, vrbls
_test_tuple = pycoll.namedtuple('_test_tuple',
'num_devices, in_shapes out_shapes out_i')
def _do_pack_unpack_test(self, tt):
"""Do a single pack-unpack test.
Args:
tt: A _test_tuple defining the parameters of the test to do.
This test executes a graph that performs a pack of tower_grads
followed by an unpack and verifies that the shapes and values
of gradient tensors are unchanged, along with paired variables.
"""
with ops.Graph().as_default():
tower_grads, consts, _, vrbls = self._init_tensors(
tt.num_devices, tt.in_shapes)
packed_tg, packing = allreduce.pack_small_tensors(
tower_grads, max_bytes=40, max_group=10)
unpacked_tg = allreduce.unpack_small_tensors(packed_tg, packing)
with self.test_session() as sess:
sess.run(variables.global_variables_initializer())
packed = sess.run(packed_tg)
for d in range(0, tt.num_devices):
for t in range(0, len(tt.out_shapes)):
num_elts = 0
for dim in tt.out_shapes[t]:
num_elts = (num_elts or 1) * dim
self.assertTrue(np.array_equal(
np.array(range(tt.out_i[t], tt.out_i[t] + num_elts),
dtype=np.float32).reshape(tt.out_shapes[t]),
packed[d][t][0]))
unpacked = sess.run(unpacked_tg)
for d in range(0, tt.num_devices):
for t in range(0, len(tt.in_shapes)):
self.assertTrue(np.array_equal(consts[d][t], unpacked[d][t][0]))
self.assertEqual(vrbls[d][t], unpacked_tg[d][t][1])
def testPackUnpack0(self):
self._do_pack_unpack_test(
self._test_tuple(num_devices=3,
in_shapes=[[8], [3, 3], [12], [5, 5, 5]],
out_shapes=[[17], [12], [5, 5, 5]],
out_i=[0, 17, 29]))
def testPackUnpack1(self):
self._do_pack_unpack_test(
self._test_tuple(num_devices=4,
in_shapes=[[5, 5, 5], [2, 3], [5]],
out_shapes=[[11], [5, 5, 5]],
out_i=[125, 0]))
def testPackUnpack2(self):
self._do_pack_unpack_test(
self._test_tuple(num_devices=2,
in_shapes=[[5, 5, 5], [2, 3], [1, 5], [7], [100]],
out_shapes=[[18], [5, 5, 5], [100]],
out_i=[125, 0, 143]))
def _do_all_reduce_pack_test(self, tt):
"""Test that all-reduce results are the same with or without packing."""
with ops.Graph().as_default():
tower_grads, consts, _, _ = self._init_tensors(
tt.num_devices, tt.in_shapes)
dev_prefixes = ['/job:localhost']
num_workers = 1
alg = 'xring'
shards = 1
single_session = True
gpu_indices = range(0, tt.num_devices)
assert len(gpu_indices) == len(tower_grads)
no_pack_all_reduce = allreduce.sum_gradients_all_reduce(
single_session,
dev_prefixes, tower_grads, num_workers, alg, shards,
gpu_indices,
agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
packed_tg, packing = allreduce.pack_small_tensors(tower_grads, 100, 100)
packed_all_reduce = allreduce.sum_gradients_all_reduce(
single_session,
dev_prefixes, packed_tg, num_workers, alg, shards,
gpu_indices,
agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
unpacked_tg = allreduce.unpack_small_tensors(packed_all_reduce, packing)
with self.test_session() as sess:
sess.run(variables.global_variables_initializer())
no_pack_values = sess.run(no_pack_all_reduce)
pack_unpack_values = sess.run(unpacked_tg)
for d in range(1, tt.num_devices):
for t in range(0, len(tt.in_shapes)):
self.assertTrue(np.allclose(no_pack_values[d][t][0],
tt.num_devices * consts[0][t]))
self.assertTrue(np.array_equal(no_pack_values[d][t][0],
pack_unpack_values[d][t][0]))
def testAllReducePacked0(self):
self._do_all_reduce_pack_test(
self._test_tuple(num_devices=3,
in_shapes=[[8], [3, 3], [12], [5, 5, 5]],
out_shapes=[[17], [12], [5, 5, 5]],
out_i=[0, 17, 29]))
def testAllReducePacked1(self):
self._do_all_reduce_pack_test(
self._test_tuple(num_devices=2,
in_shapes=[[8], [3, 3], [12], [5, 5, 5], [3], [4]],
out_shapes=[[17], [7], [12], [5, 5, 5]],
out_i=[0, 17, 29, 154, 157]))
if __name__ == '__main__':
tf.disable_v2_behavior()
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes and functions for doing a single-machine batch all-reduce.
An all-reduce is taking the reduction (typically a sum) of a list of tensors,
each on a different device. The result must end up back on each device, which is
where the word "all" comes from. In summary, each device starts with a single
tensor, and ends up with the reduction of all tensors.
A batch all-reduce is doing several independent all-reduces. When doing a batch
all-reduce, care is taken to evenly distribute the reduction computations
across devices and inter-device tensor transfers across device links.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# TODO(reedwm): Support distributed all-reduces in this file.
# TODO(reedwm): Merge this code with allreduce.py, which contains some batch
# all-reduce code that this file calls. allreduce.py also supports distributed
# batch-reduce while this file only supports single-machine all-reduce.
import abc
import six
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import data_flow_ops
import allreduce
import constants
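# Illustrative sketch (plain NumPy, not used by this module): a batch
# all-reduce of two tensors across two "devices". Each inner list holds one
# device's tensors; the nth tensors across devices are summed together and
# every device ends up with every reduced tensor.
#
#   import numpy as np
#   all_device_tensors = [[np.array([1.]), np.array([2.])],   # device 0: A, B
#                         [np.array([3.]), np.array([4.])]]   # device 1: C, D
#   reduced = [sum(col) for col in zip(*all_device_tensors)]  # [A+C, B+D]
#   result = [list(reduced) for _ in all_device_tensors]      # same on each device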
def _all_reduce_using_copy(tensors_across_devices, use_mean):
"""Does an all-reduce of a list of tensors by copying to the current device.
The tensors are copied to the current device and then reduced.
Args:
tensors_across_devices: A list of tensors, each on a different device.
use_mean: Whether to take the mean of the tensors instead of a sum.
Returns:
A reduced tensor on the current device.
"""
reduced_tensor = tf.add_n(tensors_across_devices)
if use_mean:
reduced_tensor *= 1 / len(tensors_across_devices)
return reduced_tensor
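# Usage sketch for _all_reduce_using_copy (hedged; the device placement is
# illustrative and the noted results are what the returned tensors evaluate to):
#
#   a = tf.constant([1., 2.])  # imagine this lives on GPU:0
#   b = tf.constant([3., 4.])  # imagine this lives on GPU:1
#   summed = _all_reduce_using_copy([a, b], use_mean=False)   # -> [4., 6.]
#   averaged = _all_reduce_using_copy([a, b], use_mean=True)  # -> [2., 3.]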
@six.add_metaclass(abc.ABCMeta)
class BatchAllReduceAlgorithm(object):
"""Represents an algorithm for performing a batch all-reduce operation."""
def batch_all_reduce(self,
all_device_tensors,
num_splits,
compact_tensors,
defer_tensors,
xla_compile=False):
"""Performs a batch all-reduce.
The reduction done is a sum.
`all_device_tensors` is a list of list of tensors that will be batch
all-reduced. All tensors within a single inner list must be on the same
device. The nth element in each list, for any n, will be reduced together.
The return value is in the same form as `all_device_tensors`, except that
each tensor is reduced.
For example, if `all_device_tensors` is:
[[ A, B ], # A and B are on GPU 0
[ C, D ]] # C and D are on GPU 1
Then the return value will be:
[[ A+C, B+D ], # These two tensors are on GPU 0
[ A+C, B+D ]] # These two tensors are on GPU 1
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
is a tensor where `i` is the device index and `j` is the tensor index.
num_splits: If not None, tensors will be concatenated and split into this
many pieces during the all-reduce, then split back into their original
shapes afterwards. Has no impact on correctness and can improve
performance. Requires all tensors to be the same type.
compact_tensors: If True, tensors are cast to fp16 before being all-
reduced. Improves performance, but hurts numerical stability.
defer_tensors: If True, every time the return value
`reduced_all_device_tensors` is evaluated, the result will be the
reduced tensor values of `all_device_tensors` from the previous session
run instead of the current session run, or zero on the first session
run. When training neural networks, deferring gradients often does not
harm training, so this can be used to improve performance.
xla_compile: If True, use XLA to compile the gradient packing and
unpacking ops.
Returns:
reduced_all_device_tensors: A list in the same form as
`all_device_tensors`, except each tensor has been reduced.
warmup_ops: A list of ops needed to be run once before the all-reduce can
occur.
"""
# Before all-reducing tensors, we do several preprocessing functions that
# can speed up the all-reduce. We undo these functions after all-reducing
# the tensors.
# all_device_packed_tensors is a 2-d list of tensors indexed by
# [device_id][tensor_id], holding packed tensors from all devices involved
# in all-reduce.
all_device_packed_tensors = []
# all_device_warmup_ops is a 2-d list of ops indexed by
# [device_id][tensor_id], holding warmup_ops that need to be run once before
# all-reduce can occur.
all_device_warmup_ops = []
# all_device_put_ops is a 2-d list of ops indexed by
# [device_id][tensor_id], holding put ops for deferred tensors. They will be
# called in each all-reduce step automatically due to control dependency.
all_device_put_ops = []
# packers is a list of _TensorPacker, one for each device involved in
# all-reduce.
packers = [
_TensorPacker(num_splits, compact_tensors) for _ in all_device_tensors
]
for packer, device_tensors in zip(packers, all_device_tensors):
def pack_single_device_tensors(packer=packer,
device_tensors=device_tensors):
"""Pack gradient tensors of a device."""
packed_tensors = packer.maybe_concat_tensors(device_tensors)
packed_tensors = packer.maybe_compact_tensors(packed_tensors)
# When xla_compile=False, defer tensors after concat for better
# performance.
if defer_tensors and not xla_compile:
packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
packed_tensors)
all_device_put_ops.append(put_ops)
all_device_warmup_ops.append(warmup_ops)
packed_tensors = packer.maybe_split_tensors(packed_tensors)
return packed_tensors
with tf.device(device_tensors[0].device):
if xla_compile:
packed_tensors = tf.xla.experimental.compile(
pack_single_device_tensors)
# When xla_compile=True, intermediate tensors in packing process are
# not materialized. Thus, we defer tensors after packing process is
# completed instead of in the middle of it.
if defer_tensors:
packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
packed_tensors)
all_device_put_ops.append(put_ops)
all_device_warmup_ops.append(warmup_ops)
else:
packed_tensors = pack_single_device_tensors()
all_device_packed_tensors.append(packed_tensors)
# Perform all-reduce on packed tensors.
all_device_tensors = self._do_batch_all_reduce(all_device_packed_tensors)
all_device_unpacked_tensors = []
for packer, device_tensors in zip(packers, all_device_tensors):
def unpack_single_device_tensors(packer=packer,
device_tensors=device_tensors):
"""Unpack gradient tensors of a device."""
unpacked_tensors = packer.undo_maybe_split_tensors(device_tensors)
unpacked_tensors = packer.undo_maybe_compact_tensors(unpacked_tensors)
unpacked_tensors = packer.undo_maybe_concat_tensors(unpacked_tensors)
return unpacked_tensors
with tf.device(device_tensors[0].device):
if xla_compile:
unpacked_device_tensor = tf.xla.experimental.compile(
unpack_single_device_tensors)
else:
unpacked_device_tensor = unpack_single_device_tensors()
all_device_unpacked_tensors.append(unpacked_device_tensor)
# Note: There is no undo operation for deferring tensors. But we do need to
# call _add_put_op_control_deps at the end if we deferred the tensors.
if defer_tensors:
all_device_unpacked_tensors = _add_put_op_control_deps(
all_device_unpacked_tensors, num_splits, all_device_put_ops)
return all_device_unpacked_tensors, all_device_warmup_ops
@abc.abstractmethod
def _do_batch_all_reduce(self, all_device_tensors):
"""Performs a batch all-reduce.
Unlike `self.batch_all_reduce`, this does not do any preprocessing of the
tensors.
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
is a tensor where `i` is the device index and `j` is the tensor index.
Returns:
reduced_all_device_tensors: A list in the same form as
`all_device_tensors`, except each tensor has been reduced.
"""
pass
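# Sketch of how a concrete BatchAllReduceAlgorithm is typically driven. This
# is one plausible wiring of the benchmark flags to the method arguments; the
# real call sites live in the benchmark's variable managers:
#
#   algorithm = algorithm_from_params(params)  # defined below
#   reduced, warmup_ops = algorithm.batch_all_reduce(
#       all_device_tensors,
#       num_splits=params.gradient_repacking,
#       compact_tensors=params.compact_gradient_transfer,
#       defer_tensors=(params.variable_consistency == 'relaxed'))
#   # When defer_tensors=True, warmup_ops must be run once before the first
#   # step that evaluates `reduced`.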
class CopyToDeviceAlgorithm(BatchAllReduceAlgorithm):
"""An algorithm that copies tensors to be reduced to a specific device."""
def __init__(self, devices_to_reduce_on, use_mean=False):
self._devices = devices_to_reduce_on
self._use_mean = use_mean
def _do_batch_all_reduce(self, all_device_tensors):
reduced_tensors = []
for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
with tf.device(self._devices[i % len(self._devices)]):
reduced_tensor = _all_reduce_using_copy(tensors_across_devices,
self._use_mean)
reduced_tensors.append(reduced_tensor)
# The tensors will be brought back to each device once they are used.
return [reduced_tensors] * len(all_device_tensors)
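# Minimal sketch: reduce everything on the CPU, mirroring what
# algorithm_from_params (below) builds for --local_parameter_device=cpu:
#
#   algorithm = CopyToDeviceAlgorithm(['/cpu:0'])
#   reduced, _ = algorithm.batch_all_reduce(
#       all_device_tensors, num_splits=0, compact_tensors=False,
#       defer_tensors=False)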
class HierarchicalCopyAlgorithm(BatchAllReduceAlgorithm):
"""An algorithm that uses hierarchical copies. This is only optimized for
eight devices connected in NetworkTopology.DGX1 or NetworkTopology.GCP_V100
topology.
"""
def __init__(self, network_topology):
"""Initializer for HierarchicalCopyAlgorithm.
Args:
network_topology: An instance of Enum class constants.NetworkTopology.
"""
self._network_topology = network_topology
def _do_batch_all_reduce(self, all_device_tensors):
avail_devices = [device_tensors[0].device
for device_tensors in all_device_tensors]
reduced_tensors = []
num_devices = len(avail_devices)
group_size = num_devices // 2
for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
group_0_main_device, group_1_main_device = self.__get_main_devices(
i, num_devices)
if group_0_main_device < group_size:
group_0_begin = 0
group_1_begin = group_size
else:
group_0_begin = group_size
group_1_begin = 0
# Reduce the first group.
group_0_tensors = tensors_across_devices[group_0_begin:
group_0_begin + group_size]
with tf.device(avail_devices[group_0_main_device]):
group_0_reduced_tensor = _all_reduce_using_copy(group_0_tensors, False)
# Reduce the second group.
group_1_tensors = tensors_across_devices[group_1_begin:
group_1_begin + group_size]
with tf.device(avail_devices[group_1_main_device]):
group_1_reduced_tensor = _all_reduce_using_copy(group_1_tensors, False)
# Reduce between the groups.
with tf.device(avail_devices[group_0_main_device]):
total_reduced_tensor = _all_reduce_using_copy(
[group_0_reduced_tensor, group_1_reduced_tensor], False)
# Broadcast the result back into the root of each group.
with tf.device(avail_devices[group_0_main_device]):
group_0_reduced_tensor_bcast = tf.identity(total_reduced_tensor)
with tf.device(avail_devices[group_1_main_device]):
group_1_reduced_tensor_bcast = tf.identity(total_reduced_tensor)
reduced_tensors_bcast = []
for j in range(len(tensors_across_devices)):
with tf.device(avail_devices[j]):
# Broadcast the result back to each member in the group from the root.
if (group_0_main_device < group_size) == (j < group_size):
src_device_tensor = group_0_reduced_tensor_bcast
else:
src_device_tensor = group_1_reduced_tensor_bcast
reduced_tensors_bcast.append(tf.identity(src_device_tensor))
reduced_tensors.append(reduced_tensors_bcast)
reduced_tensors = list(zip(*reduced_tensors))
return reduced_tensors
def __get_main_devices(self, tensor_index, num_devices):
"""Returns the pair of main devices to use for initial reduction.
Args:
tensor_index: Index of the current tensor in the list of tensors to copy.
num_devices: Total number of devices.
Returns:
A tuple containing the pair of main device indices for the initial
reduction; the first element of the tuple should also be used for the
final reduction.
Raises:
ValueError: Invalid input arguments.
"""
if self._network_topology == constants.NetworkTopology.DGX1:
return tensor_index % num_devices, (tensor_index +
(num_devices // 2)) % num_devices
elif self._network_topology == constants.NetworkTopology.GCP_V100:
if num_devices != 8:
raise ValueError('HierarchicalCopy only supports eight devices in %s.' %
self._network_topology)
# TODO(hinsu): Generalize main device indices to handle any other
# isomorphic connection graph that connects two cliques using connections
# other than 0-5 and 2-7.
main_device_pairs = [(0, 5), (2, 7), (5, 0), (7, 2)]
return main_device_pairs[tensor_index % len(main_device_pairs)]
else:
# TODO(reedwm): make this logic more general for arbitrary topology.
raise ValueError(
'HierarchicalCopy is not supported for %s network topology.' %
self._network_topology)
class AllReduceSpecAlgorithm(BatchAllReduceAlgorithm):
"""An algorithm that uses an all reduce spec."""
def __init__(self, all_reduce_spec, gpu_indices, agg_small_grads_max_bytes,
agg_small_grads_max_group):
spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
if len(spec) != 1:
raise ValueError(
'Replicated mode does not support hybrid all-reduce strategies')
self._all_reduce_spec = spec[0]
self._gpu_indices = gpu_indices
self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
self._agg_small_grads_max_group = agg_small_grads_max_group
def _do_batch_all_reduce(self, all_device_tensors):
# TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
# gradient aggregation code, since gradient aggregation is doing an all
# reduce. Currently, we do gradient repacking in two different places.
# TODO(reedwm): Change the allreduce code to reduce tensors instead of
# tower_grads.
tower_grads = [[(t, None) for t in device_tensors]
for device_tensors in all_device_tensors]
aggregated_device_grads = allreduce.sum_gradients_all_reduce(
False, # single_session
['/job:localhost'],
tower_grads,
1,
self._all_reduce_spec.alg,
self._all_reduce_spec.shards,
self._gpu_indices,
agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
agg_small_grads_max_group=self._agg_small_grads_max_group)
return [[t for t, _ in grad_vars] for grad_vars in aggregated_device_grads]
def algorithm_from_params(params):
"""Returns a BatchAllReduceAlgorithm from a Params tuple."""
if params.all_reduce_spec:
if params.gpu_indices:
gpu_indices = [int(x) for x in params.gpu_indices.split(',')]
else:
gpu_indices = [x for x in range(params.num_gpus)]
return AllReduceSpecAlgorithm(params.all_reduce_spec, gpu_indices,
params.agg_small_grads_max_bytes,
params.agg_small_grads_max_group)
elif params.hierarchical_copy:
return HierarchicalCopyAlgorithm(params.network_topology)
else:
if params.local_parameter_device == 'gpu':
devices_to_reduce_on = ['/gpu:%d' % i for i in range(params.num_gpus)]
else:
devices_to_reduce_on = ['/cpu:0']
return CopyToDeviceAlgorithm(devices_to_reduce_on)
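# Selection sketch with a hypothetical params-like namedtuple (the real code
# passes the full flags/params object; only the fields touched on this code
# path are populated meaningfully):
#
#   import collections
#   _FakeParams = collections.namedtuple(
#       '_FakeParams',
#       ['all_reduce_spec', 'hierarchical_copy', 'network_topology',
#        'gpu_indices', 'num_gpus', 'agg_small_grads_max_bytes',
#        'agg_small_grads_max_group', 'local_parameter_device'])
#   params = _FakeParams(
#       all_reduce_spec=None, hierarchical_copy=False, network_topology=None,
#       gpu_indices='', num_gpus=2, agg_small_grads_max_bytes=0,
#       agg_small_grads_max_group=10, local_parameter_device='gpu')
#   algorithm = algorithm_from_params(params)  # -> CopyToDeviceAlgorithm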
def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True):
"""Applies a function to each tensor in `all_device_tensors`.
A new list of lists of tensors is returned, where every tensor in
`all_device_tensors` has had `apply_func` called on it. `all_device_tensors`
is not modified.
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is
a tensor where `i` is the device index and `j` is the tensor index.
apply_func: A function taking in three arguments: tensor, device_index,
tensor_index, and returning a modified tensor.
`tensor` is `all_device_tensors[device_index][tensor_index]`.
colocate: If True, apply_func will be run under a context manager
colocated with its input tensor.
Returns:
A list in the same form as `all_device_tensors`, except each tensor has had
`apply_func` called on it.
"""
new_all_device_tensors = []
for device_index, device_tensors in enumerate(all_device_tensors):
new_device_tensors = []
for tensor_index, t in enumerate(device_tensors):
if colocate:
with tf.colocate_with(t):
new_t = apply_func(t, device_index, tensor_index)
else:
new_t = apply_func(t, device_index, tensor_index)
new_device_tensors.append(new_t)
new_all_device_tensors.append(new_device_tensors)
return new_all_device_tensors
def _defer_tensor(tensor):
"""Defers the retrieval of a tensor.
The tensor is put into a StagingArea, and the return value is the
retrieval of the tensor from the StagingArea. The effect is that the
tensor returned from this function is the tensor that was put in the
StagingArea for the previous Session.run() call.
Args:
tensor: The tensor to defer for one step.
Returns:
deferred_tensor: The tensor deferred for one step.
put_op: An op to put `tensor` in the StagingArea. Must be run every step
that `deferred_tensor` is run.
warmup_op: A warmup op that should be called before the first step. Puts
a zero tensor into the StagingArea.
"""
tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape])
put_op = tensor_stage.put([tensor])
warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)])
# Fetch the next tensor to use.
(tensor,) = tensor_stage.get()
return tensor, put_op, warmup_op
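# Sketch of the one-step delay introduced by _defer_tensor (hedged; the shape
# and values are illustrative):
#
#   x = tf.placeholder(tf.float32, shape=[2])
#   deferred, put_op, warmup_op = _defer_tensor(x)
#   with tf.Session() as sess:
#     sess.run(warmup_op)                                   # stage zeros
#     v1 = sess.run([deferred, put_op], {x: [1., 2.]})[0]   # -> [0., 0.]
#     v2 = sess.run([deferred, put_op], {x: [3., 4.]})[0]   # -> [1., 2.]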
def defer_single_device_tensors(device_tensors):
"""Defer tensors (gradients in this case) from a single device.
Args:
device_tensors: A list of gradients tensors from a single device to defer.
Returns:
deferred_tensors: A list of tensors deferred for one step.
put_ops: A list of ops that put `tensors` in the StagingAreas. Must be run
every step that `deferred_tensors` is run.
warmup_ops: Warmup ops that should be called before the first step. Puts
zero tensors into the StagingArea.
"""
put_ops = []
warmup_ops = []
deferred_tensors = []
for tensor in device_tensors:
deferred_tensor, put_op, warmup_op = _defer_tensor(tensor)
deferred_tensors.append(deferred_tensor)
put_ops.append(put_op)
warmup_ops.append(warmup_op)
return deferred_tensors, put_ops, warmup_ops
def _add_put_op_control_deps(all_device_tensors, num_splits, put_ops):
"""Add control dependencies from `put_ops` to `all_device_tensors`.
This should only be called when deferred tensors are being used.
The control dependencies are added so that the put ops are run whenever
`all_device_tensors` is run. That way, the caller does not have to explicitly
run the put ops.
Args:
all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is
a tensor where `i` is the device index and `j` is the tensor index.
num_splits: The number of splits that were used for the all-reduce.
put_ops: A list of put ops from deferring the tensors.
Returns:
A list in the same form as `all_device_tensors`, except each tensor has a
control dependency on an op in `put_ops`.
"""
def apply_func(tensor, device_index, tensor_index):
if num_splits == 0:
deps = [put_ops[device_index][tensor_index]]
else:
deps = put_ops[device_index]
assert len(deps) == 1
with tf.control_dependencies(deps):
return tf.identity(tensor, name='control_dependency')
return _apply_to_all_device_tensors(all_device_tensors, apply_func)
class _TensorPacker(object):
"""Packs and unpacks tensors into groups.
This class first concatenates a set of tensors, then splits the concatenated
tensor into a small number of chunks. This is useful for all-reducing tensors,
as doing a small number of all-reduces on large tensors can be faster than
doing a large number of all-reduces on small tensors.
It also provides an option to compact tensors by casting them to fp16, for
better all-reduce performance.
This class maintains state about the processed tensors, such as their shapes
and types, so each packer can only be used to pack and unpack one list of
tensors. If you need to pack multiple lists of tensors (say, from multiple
devices), then you need multiple _TensorPacker objects, one per device.
"""
def __init__(self, num_splits, compact):
"""Initializes the _TensorPacker.
Args:
num_splits: The number of tensors to split the concatenated tensor into.
The batch all-reduce will consist of `num_splits` all-reduces. If None
or zero, tensors are not split or concatenated.
compact: If True, tensors are cast to fp16 during packing and cast
back to their original dtypes during unpacking.
"""
self._num_splits = num_splits
self._compact = compact
self._before_compact_dtypes = []
def maybe_concat_tensors(self, device_tensors):
"""Concatenate tensors into a single tensor."""
if not self._num_splits:
return device_tensors
flat_tensors = [tf.reshape(t, [-1]) for t in device_tensors]
self._orig_shapes = [t.shape for t in device_tensors]
self._orig_sizes = [s.num_elements() for s in self._orig_shapes]
# All shapes must be fully defined.
assert None not in self._orig_sizes
concatenated_grad = tf.concat(flat_tensors, 0)
return [concatenated_grad]
def maybe_split_tensors(self, concatenated_tensor):
"""Split concatenated tensor into `num_splits` pieces."""
if not self._num_splits:
return concatenated_tensor
if len(concatenated_tensor) != 1:
raise RuntimeError('tensors must be concatenated via '
'maybe_concat_tensors() before splitting')
concatenated_tensor = concatenated_tensor[0]
total_tensor_size = concatenated_tensor.shape.num_elements()
split_size = total_tensor_size // self._num_splits
split_size_last = total_tensor_size - split_size * (self._num_splits - 1)
split_sizes = [split_size] * (self._num_splits - 1) + [split_size_last]
tensor_packs = tf.split(concatenated_tensor, split_sizes)
return tensor_packs
def undo_maybe_split_tensors(self, tensor_packs):
"""Undo maybe_split_tensors()."""
if not self._num_splits:
return tensor_packs
return [tf.concat(tensor_packs, 0)]
def undo_maybe_concat_tensors(self, concatenated_tensor):
"""Undo maybe_concat_tensors()."""
if not self._num_splits:
return concatenated_tensor
if len(concatenated_tensor) != 1:
raise RuntimeError(
'undo_maybe_split_tensors() must be called before '
'undo_maybe_concat_tensors when num_splits is greater than 1')
concatenated_tensor = concatenated_tensor[0]
tensors_with_sizes = tf.split(concatenated_tensor,
self._orig_sizes)
tensors_with_shapes = [
tf.reshape(grad, shape) for grad, shape in zip(
tensors_with_sizes, self._orig_shapes)
]
return tensors_with_shapes
def maybe_compact_tensors(self, device_tensors):
"""Cast tensors to fp16 and store their original types."""
if not self._compact:
return device_tensors
if self._before_compact_dtypes:
raise RuntimeError('maybe_compact_tensors can only be called once.')
self._before_compact_dtypes = [t.dtype for t in device_tensors]
compact_tensors = [tf.cast(t, tf.float16) for t in device_tensors]
return compact_tensors
def undo_maybe_compact_tensors(self, compact_tensors):
"""Undo maybe_compact_tensors()."""
if not self._compact:
return compact_tensors
if not self._before_compact_dtypes:
raise RuntimeError('maybe_compact_tensors() must be called before '
'undo_maybe_compact_tensors()')
device_tensors = [
tf.cast(t, dtype)
for t, dtype in zip(compact_tensors, self._before_compact_dtypes)
]
return device_tensors
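# Round-trip sketch for _TensorPacker (hedged illustration; as the class
# docstring notes, one packer is needed per device):
#
#   packer = _TensorPacker(num_splits=2, compact=True)
#   tensors = [tf.ones([4]), tf.ones([3, 3])]
#   packed = packer.maybe_concat_tensors(tensors)   # one 13-element tensor
#   packed = packer.maybe_compact_tensors(packed)   # cast to fp16
#   packed = packer.maybe_split_tensors(packed)     # two chunks: [6] and [7]
#   # ... all-reduce `packed` here ...
#   out = packer.undo_maybe_split_tensors(packed)
#   out = packer.undo_maybe_compact_tensors(out)
#   out = packer.undo_maybe_concat_tensors(out)     # shapes [4] and [3, 3]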
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow benchmark library.
See the README for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
from collections import namedtuple
import contextlib
import math
import multiprocessing
import os
import re
import threading
import time
import traceback
from absl import flags as absl_flags
import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import cnn_util
import constants
import datasets
import flags
import mlperf
import variable_mgr
import variable_mgr_util
from cnn_util import log_fn
from models import model_config
from platforms import util as platforms_util
from google.protobuf import text_format
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python import debug as tf_debug
from tensorflow.python.client import timeline
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_util_impl
from tensorflow.python.framework import importer
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import nest
_DEFAULT_NUM_BATCHES = 100
# GraphInfo encapsulates the tensors/ops that we care about after building a
# graph. We use them to benchmark the graph.
GraphInfo = namedtuple( # pylint: disable=invalid-name
'GraphInfo',
[
# Ops that produce the input batches (before preprocessing).
'input_producer_op',
# Ops that add the preprocessed images to the staging areas
'enqueue_ops',
# Fetches of sess.run()
'fetches',
# Op that performs synchronization in distributed mode
'execution_barrier',
# The global step variable
'global_step',
# Group of ops that perform per-device initialization work
'local_var_init_op_group',
# Op to produce summaries
'summary_op'
])
# InputProcessingInfo contains various sources of inputs which will later be fed
# into the model. If synthetic data is used, all three fields are None.
InputProcessingInfo = namedtuple(
'InputProcessingInfo',
[
# The first two fields are non-None iff datasets prefetching is not
# used.
# Ops that produce the input batches.
'input_producer_op',
# A list of StagingArea for each device.
'input_producer_stages',
# Input produced using multi device iterator. Non-None iff datasets
# prefetching is used
'multi_device_iterator_input'
])
# TODO(reedwm): add upper_bound and lower_bound to appropriate integer and
# float flags, and change certain string flags to enum flags.
flags.DEFINE_string('model', 'trivial',
'Name of the model to run, the list of supported models '
'are defined in models/model.py')
# The code will first check if it's running under benchmarking mode
# or evaluation mode, depending on 'eval':
# Under the evaluation mode, this script will read a saved model,
# and compute the accuracy of the model against a validation dataset.
# Additional ops for accuracy and top_k predictors are only used under
# this mode.
# Under the benchmarking mode, the user can specify whether or not to use
# the forward-only option, which will only compute the loss function.
# forward-only cannot be enabled with eval at the same time.
flags.DEFINE_boolean('eval', False, 'whether to use eval or benchmarking')
flags.DEFINE_integer('eval_interval_secs', 0,
'How often to run eval on saved checkpoints. Usually the '
'same as save_model_secs from the corresponding training '
'run. Pass 0 to eval only once.')
flags.DEFINE_integer('eval_during_training_every_n_steps', None,
'Every n steps during training, pause training, run '
'evaluation, then resume training. Must not be used with '
'--eval, as unlike --eval, this option causes both '
'training and eval to be done. This may take slightly '
'more GPU memory than running just training or evaluation '
'alone. It also may slightly slow down training, even '
'when not taking into account the additional time to '
'evaluate.', lower_bound=1)
flags.DEFINE_float('eval_during_training_every_n_epochs', None,
'After every n training epochs, pause training, run '
'evaluation, then resume training. See '
'--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_steps', [],
'Specify a list of training steps, pause training at each of '
'these steps, run evaluation, then resume training. See '
'--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_epochs', [],
'Specify a list of training epochs, pause training after '
'each of these epochs, run evaluation, then resume training. '
'See --eval_during_training_every_n_steps for more '
'information.')
flags.DEFINE_boolean('forward_only', False,
'whether to use forward-only or training for benchmarking')
flags.DEFINE_boolean('freeze_when_forward_only', False,
'whether to freeze the graph when in forward-only mode.')
flags.DEFINE_boolean('print_training_accuracy', False,
'whether to calculate and print training accuracy during '
'training')
flags.DEFINE_integer('batch_size', 0, 'batch size per compute device')
flags.DEFINE_integer('eval_batch_size', 0, 'eval batch size per compute device')
flags.DEFINE_integer('batch_group_size', 1,
'number of groups of batches processed in the image '
'producer.')
flags.DEFINE_integer('num_batches', None, 'number of batches to run, excluding '
'warmup. Defaults to %d' % _DEFAULT_NUM_BATCHES)
flags.DEFINE_integer('num_eval_batches', None,
'number of eval batches to run, excluding warmup. '
'Defaults to --num_batches')
flags.DEFINE_float('num_epochs', None,
'number of epochs to run, excluding warmup. '
'This and --num_batches cannot both be specified.')
flags.DEFINE_float('num_eval_epochs', None,
'number of eval epochs to run, excluding warmup. '
'Defaults to --num_epochs')
flags.DEFINE_float('stop_at_top_1_accuracy', None,
'If set, stops training after the evaluation accuracy hits '
'this number. Can only be used with one of the '
'--eval_during_training_* flags.')
flags.DEFINE_boolean('collect_eval_results_async', False,
'If True, start a separate process to postprocess eval '
'results asynchronously. This currently only works with '
'the SSD model.')
flags.DEFINE_integer('num_warmup_batches', None,
'number of batches to run before timing')
flags.DEFINE_integer('autotune_threshold', None,
'The autotune threshold for the models')
# TODO(tucker): change num_gpus to num_devices
flags.DEFINE_integer('num_gpus', 1, 'the number of GPUs to run on')
flags.DEFINE_string('gpu_indices', '', 'indices of worker GPUs in ring order')
flags.DEFINE_integer('display_every', 10,
'Number of local steps after which progress is printed '
'out')
flags.DEFINE_float('display_perf_ewma', None,
'If set, display the images/sec numbers using an exponentially '
'weighted moving average with the specified weight, which '
'defines how much the current value contributes to the reported '
'average. Increasing the weight makes the reported performance '
'number reflect the real-time speed more than the entire '
'history.', lower_bound=0, upper_bound=1)
flags.DEFINE_string('data_dir', None,
'Path to dataset in TFRecord format (aka Example '
'protobufs). If not specified, synthetic data will be '
'used.')
flags.DEFINE_string('data_name', None,
'Name of dataset: imagenet or cifar10. If not specified, '
'it is automatically guessed based on data_dir.')
flags.DEFINE_string('resize_method', 'bilinear',
'Method for resizing input images: crop, nearest, '
'bilinear, bicubic, area, or round_robin. The `crop` mode '
'requires source images to be at least as large as the '
'network input size. The `round_robin` mode applies '
'different resize methods based on position in a batch in '
'a round-robin fashion. Other modes support any sizes and '
'apply random bbox distortions before resizing (even with '
'distortions=False).')
flags.DEFINE_boolean('distortions', False,
'Enable/disable distortions during image preprocessing. '
'These include bbox and color distortions.')
flags.DEFINE_boolean('use_datasets', True,
'Enable use of datasets for input pipeline')
flags.DEFINE_string('input_preprocessor', 'default',
'Name of input preprocessor. The list of supported input '
'preprocessors are defined in preprocessing.py.')
flags.DEFINE_string('gpu_thread_mode', 'gpu_private',
'Methods to assign GPU host work to threads. '
'global: all GPUs and CPUs share the same global threads; '
'gpu_private: a private threadpool for each GPU; '
'gpu_shared: all GPUs share the same threadpool.')
flags.DEFINE_integer('per_gpu_thread_count', 0,
'The number of threads to use for GPU. Only valid when '
'gpu_thread_mode is not global.')
flags.DEFINE_boolean('hierarchical_copy', False,
'Use hierarchical copies. Currently only optimized for '
'use on a DGX-1 with 8 GPUs and may perform poorly on '
'other hardware. Requires --num_gpus > 1, and only '
'recommended when --num_gpus=8')
# TODO(hinsu): Support auto-detection of the network topology while still
# retaining the ability to specify a particular topology for debugging.
flags.DEFINE_enum(
'network_topology', constants.NetworkTopology.DGX1,
(constants.NetworkTopology.DGX1, constants.NetworkTopology.GCP_V100),
'Network topology specifies the topology used to connect multiple devices. '
'Network topology is used to decide the hierarchy to use for the '
'hierarchical_copy.')
flags.DEFINE_integer('gradient_repacking', 0, 'Use gradient repacking. It '
'currently only works with replicated mode. At the end of '
'each step, it repacks the gradients for more efficient '
'cross-device transportation. A non-zero value specifies '
'the number of split packs that will be formed.',
lower_bound=0)
flags.DEFINE_boolean('compact_gradient_transfer', True, 'Compact gradients '
'as much as possible for cross-device transfer and '
'aggregation.')
flags.DEFINE_enum('variable_consistency', 'strong', ('strong', 'relaxed'),
'The data consistency for trainable variables. With strong '
'consistency, the variables always have the updates from the '
'previous step. With relaxed consistency, all the updates '
'will eventually show up in the variables. Likely one step '
'behind.')
flags.DEFINE_boolean('datasets_repeat_cached_sample', False,
'Enable use of a special datasets pipeline that reads a '
'single TFRecord into memory and repeats it infinitely '
'many times. The purpose of this flag is to make it '
'possible to write regression tests that are not '
'bottlenecked by CNS throughput. '
'Use datasets_use_caching to cache input data.')
flags.DEFINE_enum('local_parameter_device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
'Device to use as parameter server: cpu or gpu. For '
'distributed training, it can affect where caching of '
'variables happens.')
flags.DEFINE_enum('device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
'Device to use for computation: cpu or gpu')
flags.DEFINE_enum('data_format', 'NCHW', ('NHWC', 'NCHW'),
'Data layout to use: NHWC (TF native) or NCHW (cuDNN '
'native, requires GPU).')
flags.DEFINE_integer('num_intra_threads', None,
'Number of threads to use for intra-op parallelism. If '
'set to 0, the system will pick an appropriate number. '
'None is the same as 0 except that it disables intra-op '
'parallelism on a GPU.')
flags.DEFINE_integer('num_inter_threads', 0,
'Number of threads to use for inter-op parallelism. If '
'set to 0, the system will pick an appropriate number.')
flags.DEFINE_boolean('use_numa_affinity', False,
'Whether to turn on NUMA affinity for CPU devices. '
'This is probably only useful when --device=cpu.')
flags.DEFINE_string('trace_file', '',
'Enable TensorFlow tracing and write trace to this file.')
flags.DEFINE_boolean('use_chrome_trace_format', True,
'If True, the trace_file, if specified, will be in a '
'Chrome trace format. If False, then it will be a '
'StepStats raw proto.')
_NUM_STEPS_TO_PROFILE = 10
_NUM_OPS_TO_PRINT = 20
flags.DEFINE_string('tfprof_file', None,
'If specified, write a tfprof ProfileProto to this file. '
'The performance and other aspects of the model can then '
'be analyzed with tfprof. See '
'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/g3doc/command_line.md ' # pylint: disable=line-too-long
'for more info on how to do this. The first %d steps '
'are profiled. Additionally, the top %d most time '
'consuming ops will be printed.\n'
'Note: profiling with tfprof is very slow, but most of the '
'overhead is spent between steps. So, profiling results '
'are more accurate than the slowdown would suggest.' %
(_NUM_STEPS_TO_PROFILE, _NUM_OPS_TO_PRINT))
flags.DEFINE_string('graph_file', None,
'Write the model\'s graph definition to this file. '
'Defaults to binary format unless filename ends in "txt".')
flags.DEFINE_string('partitioned_graph_file_prefix', None,
'If specified, after the graph has been partitioned and '
'optimized, write out each partitioned graph to a file '
'with the given prefix.')
flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'),
'Optimizer to use')
flags.DEFINE_float('init_learning_rate', None,
'Initial learning rate for training.')
flags.DEFINE_string('piecewise_learning_rate_schedule', None,
'Specifies a piecewise learning rate schedule based on the '
'number of epochs. This is the form LR0;E1;LR1;...;En;LRn, '
'where each LRi is a learning rate and each Ei is an epoch '
'indexed from 0. The learning rate is LRi if the '
'E(i-1) <= current_epoch < Ei. For example, if this '
'parameter is 0.3;10;0.2;25;0.1, the learning rate is 0.3 '
'for the first 10 epochs, then is 0.2 for the next 15 '
'epochs, then is 0.1 until training ends.')
flags.DEFINE_float('num_epochs_per_decay', 0,
'Epochs after which the learning rate decays. If 0, the learning '
'rate does not decay.')
flags.DEFINE_float('learning_rate_decay_factor', 0,
'Learning rate decay factor. Decay by this factor every '
'`num_epochs_per_decay` epochs. If 0, learning rate does '
'not decay.')
flags.DEFINE_float('num_learning_rate_warmup_epochs', 0,
'Slowly increase to the initial learning rate in the first '
'num_learning_rate_warmup_epochs linearly.')
flags.DEFINE_float('minimum_learning_rate', 0,
'The minimum learning rate. The learning rate will '
'never decay past this value. Requires `learning_rate`, '
'`num_epochs_per_decay` and `learning_rate_decay_factor` to '
'be set.')
flags.DEFINE_float('resnet_base_lr', None, "Base learning rate at bs=256. Only "
"relevant when training ResNet and utilizing the model's "
"learning rate heuristic (get_learning_rate).")
flags.DEFINE_float('momentum', 0.9, 'Momentum for training.')
flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')
flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum in RMSProp.')
flags.DEFINE_float('rmsprop_epsilon', 1.0, 'Epsilon term for RMSProp.')
flags.DEFINE_float('adam_beta1', 0.9, 'Beta1 term for the Adam optimizer')
flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer')
flags.DEFINE_float('adam_epsilon', 1e-8, 'Epsilon term for the Adam optimizer')
flags.DEFINE_float('gradient_clip', None,
'Gradient clipping magnitude. Disabled by default.')
flags.DEFINE_float('weight_decay', 0.00004,
'Weight decay factor for training.')
flags.DEFINE_float('gpu_memory_frac_for_testing', 0,
'If non-zero, the fraction of GPU memory that will be used. '
'Useful for testing the benchmark script, as this allows '
'distributed mode to be run on a single machine. For '
'example, if there are two tasks, each can be allocated '
'~40 percent of the memory on a single machine. This is '
'also useful for using unified memory, as this can be set '
'above 1 to oversubscribe the GPU using unified memory.',
lower_bound=0.)
flags.DEFINE_boolean('use_unified_memory', None,
'If True, allocate unified memory enabling larger models '
'to fit in available device RAM.')
flags.DEFINE_boolean('timestamped_allocator', False,
'If True, mark free BFCAllocator::Chunks with the time '
'at which they are freed, which can allow more efficient '
'memory allocation in cases like RDMA networking.')
flags.DEFINE_integer('gpu_kt_max_interval', 0,
'If > 0, the maximum number of GPU Ops that may be queued '
'in a row without also queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_bytes', 0,
'If > 0, the maximum number of bytes '
'of GPU memory that may be allocated by sequential '
'GPU Ops without queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_pending', 0,
'If > 0 no more than this many GPU tracking events may be '
'outstanding at any time. When this limit is reached '
'launch of additional kernels will stall until an '
'outstanding event completes.')
flags.DEFINE_boolean('use_tf_layers', True,
'If True, use tf.layers for neural network layers. This '
'should not affect performance or accuracy in any way.')
flags.DEFINE_integer('tf_random_seed', 1234,
'The TensorFlow random seed. Useful for debugging NaNs, '
'as this can be set to various values to see if the NaNs '
'depend on the seed.')
flags.DEFINE_string('debugger', None,
'If set, use the TensorFlow debugger. If set to "cli", use '
'the local CLI debugger. Otherwise, this must be in the '
'form hostname:port (e.g., localhost:7007) in which case '
'the experimental TensorBoard debugger will be used')
flags.DEFINE_boolean('use_python32_barrier', False,
'When True, use threading.Barrier (available since Python 3.2).')
flags.DEFINE_boolean('ml_perf', False,
'When True, change how the Imagenet input pipeline works '
'slightly to meet the MLPerf compliance rules. This slows '
'down the input pipeline. Without this option, at the end '
'of the input pipeline, the image is divided by 127.5, '
'then 1.0 is subtracted from it, bringing the image '
'values from [0, 255] to [-1.0, 1.0]. With this option, '
'each of the three channels (red, green, blue) has the '
'average channel value among all images subtracted from '
'it, and no division is done.')
flags.DEFINE_boolean('datasets_use_prefetch', True,
'Enable use of prefetched datasets for input pipeline. '
'This option is meaningless if use_datasets=False.')
flags.DEFINE_integer('datasets_prefetch_buffer_size', 1,
'Prefetching op buffer size per compute device.')
flags.DEFINE_integer('datasets_num_private_threads', None,
'Number of threads for a private threadpool created for '
'all datasets computation. By default, we pick an '
'appropriate number. If set to 0, we use the default '
'tf-Compute threads for dataset operations.')
flags.DEFINE_boolean('datasets_use_caching', False,
'Cache the compressed input data in memory. This improves '
'the data input performance, at the cost of additional '
'memory.')
flags.DEFINE_integer('datasets_parallel_interleave_cycle_length', None,
'Number of parallel file readers interleaving input data.')
flags.DEFINE_boolean('datasets_sloppy_parallel_interleave', False,
'Allow parallel interleave to depart from deterministic '
'ordering, by temporarily skipping over files whose '
'elements are not readily available. This can increase '
'throughput, in particular in the presence of stragglers.')
flags.DEFINE_integer('datasets_parallel_interleave_prefetch', None,
'The number of input elements to fetch before they are '
'needed for interleaving.')
flags.DEFINE_integer(
'multi_device_iterator_max_buffer_size', 1,
'Configuration parameter for the MultiDeviceIterator that '
'specifies the host-side buffer size for each device.')
# Performance tuning parameters.
flags.DEFINE_boolean('winograd_nonfused', True,
'Enable/disable using the Winograd non-fused algorithms.')
flags.DEFINE_boolean(
'batchnorm_persistent', True,
'Enable/disable using the CUDNN_BATCHNORM_SPATIAL_PERSISTENT '
'mode for batchnorm.')
flags.DEFINE_boolean('sync_on_finish', False,
'Enable/disable whether the devices are synced after each '
'step.')
flags.DEFINE_boolean('staged_vars', False,
'whether the variables are staged from the main '
'computation')
flags.DEFINE_boolean('force_gpu_compatible', False,
'whether to enable force_gpu_compatible in GPU_Options')
flags.DEFINE_boolean('allow_growth', None,
'whether to enable allow_growth in GPU_Options')
flags.DEFINE_boolean('xla', False, 'whether to enable XLA auto-jit compilation')
flags.DEFINE_boolean('xla_compile', False,
'Enable xla to compile the graph. Uncompilable ops will '
'result in fatal errors.')
flags.DEFINE_boolean('fuse_decode_and_crop', True,
'Fuse decode_and_crop for image preprocessing.')
flags.DEFINE_boolean('distort_color_in_yiq', True,
'Distort color of input images in YIQ space.')
flags.DEFINE_boolean('enable_optimizations', True,
'Whether to enable grappler and other optimizations.')
flags.DEFINE_string('rewriter_config', None,
'Config for graph optimizers, described as a '
'RewriterConfig proto buffer.')
flags.DEFINE_enum('loss_type_to_report', 'total_loss',
('base_loss', 'total_loss'),
'Which type of loss to output and to write summaries for. '
'The total loss includes L2 loss while the base loss does '
'not. Note that the total loss is always used while '
'computing gradients during training if weight_decay > 0, '
'but explicitly computing the total loss, instead of just '
'computing its gradients, can have a performance impact.')
flags.DEFINE_boolean('single_l2_loss_op', False,
'If True, instead of using an L2 loss op per variable, '
'concatenate the variables into a single tensor and do a '
'single L2 loss on the concatenated tensor.')
flags.DEFINE_boolean('use_resource_vars', False,
'Use resource variables instead of normal variables. '
'Resource variables are slower, but this option is useful '
'for debugging their performance.')
flags.DEFINE_boolean('compute_lr_on_cpu', False,
'If True, do computations related to learning rate on the '
'CPU instead of the GPU. This will significantly improve '
'XLA performance in some cases.')
flags.DEFINE_boolean('sparse_to_dense_grads', False,
'If True, convert all sparse gradients to dense gradients '
'before passing them to the optimizer to update '
'variables. Only affects models with sparse gradients, '
'which currently is only the NCF model.')
# Performance tuning specific to MKL.
flags.DEFINE_boolean('mkl', False, 'If true, set MKL environment variables.')
flags.DEFINE_integer('kmp_blocktime', 0,
'The time, in milliseconds, that a thread should wait, '
'after completing the execution of a parallel region, '
'before sleeping')
flags.DEFINE_string('kmp_affinity', 'granularity=fine,verbose,compact,1,0',
'Restricts execution of certain threads (virtual execution '
'units) to a subset of the physical processing units in a '
'multiprocessor computer.')
flags.DEFINE_integer('kmp_settings', 1,
'If set to 1, MKL settings will be printed.')
# fp16 parameters. If use_fp16=False, no other fp16 parameters apply.
flags.DEFINE_boolean('use_fp16', False,
'Use 16-bit floats for certain tensors instead of 32-bit '
'floats. This is currently experimental.')
# TODO(reedwm): The default loss scale of 128 causes most models to diverge
# on the second step with synthetic data. Changing the tf.set_random_seed
# call to tf.set_random_seed(1235) or most other seed values causes the
# issue not to occur.
flags.DEFINE_float('fp16_loss_scale', None,
'If fp16 is enabled, the loss is multiplied by this amount '
'right before gradients are computed, then each gradient '
'is divided by this amount. Mathematically, this has no '
'effect, but it helps avoid fp16 underflow. Set to 1 to '
'effectively disable. Ignored during eval.')
flags.DEFINE_boolean('fp16_vars', False,
'If fp16 is enabled, also use fp16 for variables. If '
'False, the variables are stored in fp32 and casted to '
'fp16 when retrieved. Recommended to leave as False.')
flags.DEFINE_boolean('fp16_enable_auto_loss_scale', False,
'If True and use_fp16 is True, automatically adjust the '
'loss scale during training.')
flags.DEFINE_integer('fp16_inc_loss_scale_every_n', 1000,
'If fp16 is enabled and fp16_enable_auto_loss_scale is '
'True, increase the loss scale every n steps.')
# The method for managing variables:
# parameter_server: variables are stored on a parameter server that holds
# the master copy of the variable. In local execution, a local device
# acts as the parameter server for each variable; in distributed
# execution, the parameter servers are separate processes in the
# cluster.
# For each step, each tower gets a copy of the variables from the
# parameter server, and sends its gradients to the param server.
# replicated: each GPU has its own copy of the variables. To apply
# gradients, an all_reduce algorithm or regular cross-device
# aggregation is used to replicate the combined gradients to all
# towers (depending on all_reduce_spec parameter setting).
# independent: each GPU has its own copy of the variables, and gradients
# are not shared between towers. This can be used to check performance
# when no data is moved between GPUs.
# distributed_replicated: Distributed training only. Each GPU has a copy
# of the variables, and updates its copy after the parameter servers
# are all updated with the gradients from all servers. Only works with
# cross_replica_sync=true. Unlike 'replicated', currently never uses
# nccl all-reduce for replicating within a server.
# distributed_all_reduce: Distributed training where all replicas run
# in a single session, using all-reduce to mutually reduce the
# gradients. Uses no parameter servers. When there is only one
# worker, this is the same as replicated.
# collective_all_reduce: Distributed training where all replicas run
# independently except for variable initialization and for
# gradient reduction which is done via collective all-reduce.
# NOTE: collective_all_reduce in conjunction with use_fp16 can
# lead to NaNs in some models (resnet50). TODO(tucker): fix it.
# horovod: Distributed training using Horovod library. Runs workers using
# an MPI framework (e.g. Open MPI). Each worker runs training on a
# single GPU, and averages gradients using NCCL or MPI all-reduce.
# See https://github.com/uber/horovod for more details.
flags.DEFINE_enum('variable_update', 'parameter_server',
('parameter_server', 'replicated', 'distributed_replicated',
'independent', 'distributed_all_reduce',
'collective_all_reduce', 'horovod'),
'The method for managing variables: parameter_server, '
'replicated, distributed_replicated, independent, '
'distributed_all_reduce, collective_all_reduce, horovod')
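# Example invocation sketch for the flag above (the entry-point script name
# is assumed here and may differ in your checkout):
#   python tf_cnn_benchmarks.py --model=resnet50 --num_gpus=8 \
#     --variable_update=replicated --all_reduce_spec=nccl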
flags.DEFINE_string('all_reduce_spec', None,
'A specification of the all_reduce algorithm to be used '
'for reducing gradients. For more details, see '
'parse_all_reduce_spec in variable_mgr.py. An '
'all_reduce_spec has BNF form:\n'
'int ::= positive whole number\n'
'g_int ::= int[KkMGT]?\n'
'alg_spec ::= alg | alg#int\n'
'range_spec ::= alg_spec | alg_spec/alg_spec\n'
'spec ::= range_spec | range_spec:g_int:range_spec\n'
'NOTE: not all syntactically correct constructs are '
'supported.\n\n'
'Examples:\n '
'"xring" == use one global ring reduction for all '
'tensors\n'
'"pscpu" == use CPU at worker 0 to reduce all tensors\n'
'"nccl" == use NCCL to locally reduce all tensors. '
'Limited to 1 worker.\n'
'"nccl/xring" == locally (to one worker) reduce values '
'using NCCL then ring reduce across workers.\n'
'"pscpu:32k:xring" == use pscpu algorithm for tensors of '
'size up to 32kB, then xring for larger tensors.')
# If variable_update==distributed_all_reduce then it may be advantageous
# to aggregate small tensors into one prior to reduction. These parameters
# control that aggregation.
flags.DEFINE_integer('agg_small_grads_max_bytes', 0,
'If > 0, try to aggregate tensors of less than this '
'number of bytes prior to all-reduce.')
flags.DEFINE_integer('agg_small_grads_max_group', 10,
'When aggregating small tensors for all-reduce do not '
'aggregate more than this many into one new tensor.')
flags.DEFINE_integer('allreduce_merge_scope', 1,
'Establish a name scope around this many '
'gradients prior to creating the all-reduce operations. '
'It may affect the ability of the backend to merge '
'parallel ops.')
# Distributed training parameters.
flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''),
'One of "ps", "worker", "controller", "". Empty for local '
'training')
flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('controller_host', None, 'optional controller host')
flags.DEFINE_integer('task_index', 0, 'Index of task within the job')
flags.DEFINE_string('server_protocol', 'grpc', 'protocol for servers')
flags.DEFINE_boolean('cross_replica_sync', True, '')
flags.DEFINE_string('horovod_device', '', 'Device to do Horovod all-reduce on: '
'empty (default), cpu or gpu. Default will utilize GPU if '
'Horovod was compiled with the HOROVOD_GPU_ALLREDUCE '
'option, and CPU otherwise.')
# Summary and Save & load checkpoints.
flags.DEFINE_integer('summary_verbosity', 0, 'Verbosity level for summary ops. '
'level 0: disable any summary.\n'
'level 1: small and fast ops, e.g.: learning_rate, '
'total_loss.\n'
'level 2: medium-cost ops, e.g. histogram of all '
'gradients.\n'
'level 3: expensive ops: images and histogram of each '
'gradient.\n')
flags.DEFINE_integer('save_summaries_steps', 0,
'How often to save summaries for trained models. Pass 0 '
'to disable summaries.')
flags.DEFINE_integer('save_model_secs', 0,
'How often to save trained models. Pass 0 to disable '
'saving checkpoints every N seconds. A checkpoint is '
'saved after training completes regardless of this '
'option.')
flags.DEFINE_integer('save_model_steps', None,
'How often to save trained models. If specified, '
'save_model_secs must not be specified.')
flags.DEFINE_integer('max_ckpts_to_keep', 5,
'Max number of checkpoints to keep.')
flags.DEFINE_string('train_dir', None,
'Path to session checkpoints. Pass None to disable saving '
'checkpoint at the end.')
flags.DEFINE_string('eval_dir', '/tmp/tf_cnn_benchmarks/eval',
'Directory where to write eval event logs.')
flags.DEFINE_string('backbone_model_path', None,
'Path to pretrained backbone model checkpoint. Pass None '
'if not using a backbone model.')
flags.DEFINE_enum('trt_mode', '', ['', 'FP32', 'FP16', 'INT8'],
'If this is specified in forward_only mode and '
'freeze_when_forward_only is set to True, use TensorRT to '
'optimize the graph before execution.')
flags.DEFINE_integer('trt_max_workspace_size_bytes', 4 << 30,
'Max workspace size bytes used by the TensorRT optimizer.')
# Benchmark logging for model garden metric
flags.DEFINE_string('benchmark_log_dir', None,
'The directory to place the log files containing the '
'results of benchmark. The logs are created by '
'BenchmarkFileLogger. Requires the root of the TensorFlow '
'models repository to be in $PYTHONPATH.')
flags.DEFINE_string('benchmark_test_id', None,
'The unique test ID of the benchmark run. It could be the '
'combination of key parameters. It is hardware independent '
'and could be used to compare the performance between '
'different test runs. This flag is designed for human '
'consumption, and does not have any impact within the '
'system.')
platforms_util.define_platform_params()
class GlobalStepWatcher(threading.Thread):
"""A helper class for global_step.
Polls for changes in the model's global_step, and finishes once the target
number of steps for the run has been reached.
"""
def __init__(self, sess, global_step_op, start_at_global_step,
end_at_global_step):
threading.Thread.__init__(self)
self.sess = sess
self.global_step_op = global_step_op
self.start_at_global_step = start_at_global_step
self.end_at_global_step = end_at_global_step
self.start_time = 0
self.start_step = 0
self.finish_time = 0
self.finish_step = 0
def run(self):
while self.finish_time == 0:
time.sleep(.25)
global_step_val, = self.sess.run([self.global_step_op])
if self.start_time == 0 and global_step_val >= self.start_at_global_step:
# Use tf.logging.info instead of log_fn, since print (which is log_fn)
# is not thread safe and may interleave the outputs from two parallel
# calls to print, which can break tests.
tf.logging.info('Starting real work at step %s at time %s' %
(global_step_val, time.ctime()))
self.start_time = time.perf_counter()
self.start_step = global_step_val
if self.finish_time == 0 and global_step_val >= self.end_at_global_step:
tf.logging.info('Finishing real work at step %s at time %s' %
(global_step_val, time.ctime()))
self.finish_time = time.perf_counter()
self.finish_step = global_step_val
def done(self):
return self.finish_time > 0
def num_steps(self):
return self.finish_step - self.start_step
def elapsed_time(self):
return self.finish_time - self.start_time
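# Hypothetical usage sketch for GlobalStepWatcher (values are made up): since
# it is a threading.Thread, callers construct it with a session and the
# global_step op, call start(), and later read its accessors, e.g.
#   watcher = GlobalStepWatcher(sess, global_step_op, 10, 110)
#   watcher.start()
#   ...  # run training steps
#   if watcher.done():
#     steps_per_sec = watcher.num_steps() / watcher.elapsed_time()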
class CheckpointNotFoundException(Exception):
pass
def create_config_proto(params):
"""Returns session config proto.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
"""
config = tf.ConfigProto()
config.allow_soft_placement = True
if params.num_intra_threads is None:
if params.device == 'gpu':
config.intra_op_parallelism_threads = 1
else:
config.intra_op_parallelism_threads = params.num_intra_threads
config.inter_op_parallelism_threads = params.num_inter_threads
config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
config.gpu_options.experimental.collective_ring_order = params.gpu_indices
config.gpu_options.force_gpu_compatible = params.force_gpu_compatible
config.experimental.use_numa_affinity = params.use_numa_affinity
if params.device == 'cpu':
# TODO(tucker): change num_gpus to num_devices
config.device_count['CPU'] = params.num_gpus
if params.allow_growth is not None:
config.gpu_options.allow_growth = params.allow_growth
if params.gpu_memory_frac_for_testing > 0:
config.gpu_options.per_process_gpu_memory_fraction = (
params.gpu_memory_frac_for_testing)
if params.use_unified_memory:
config.gpu_options.experimental.use_unified_memory = (
params.use_unified_memory)
if params.timestamped_allocator:
config.gpu_options.experimental.timestamped_allocator = (
params.timestamped_allocator)
if params.gpu_kt_max_interval > 0:
config.gpu_options.experimental.kernel_tracker_max_interval = (
params.gpu_kt_max_interval)
if params.gpu_kt_max_bytes > 0:
config.gpu_options.experimental.kernel_tracker_max_bytes = (
params.gpu_kt_max_bytes)
if params.gpu_kt_max_pending > 0:
config.gpu_options.experimental.kernel_tracker_max_pending = (
params.gpu_kt_max_pending)
if params.xla:
config.graph_options.optimizer_options.global_jit_level = (
tf.OptimizerOptions.ON_1)
if params.rewriter_config:
rewriter_config = rewriter_config_pb2.RewriterConfig()
text_format.Merge(params.rewriter_config, rewriter_config)
config.graph_options.rewrite_options.CopyFrom(rewriter_config)
elif not params.enable_optimizations:
config.graph_options.optimizer_options.opt_level = tf.OptimizerOptions.L0
config.graph_options.rewrite_options.disable_meta_optimizer = True
elif params.variable_update == 'collective_all_reduce':
rewrite_options = config.graph_options.rewrite_options
rewrite_options.scoped_allocator_optimization = (
rewriter_config_pb2.RewriterConfig.ON)
rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
if params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
config.gpu_options.visible_device_list = str(hvd.local_rank())
# For collective_all_reduce, ignore all devices except current worker.
if params.variable_update == 'collective_all_reduce':
del config.device_filters[:]
config.device_filters.append(
'/job:%s/replica:0/task:%d' % (params.job_name, params.task_index))
# TODO(b/117324590): Re-enable PinToHostOptimizer when b/117324590 is fixed.
# Currently we have to disable PinToHostOptimizer w/ XLA since it causes
# OOM/perf cliffs.
config.graph_options.rewrite_options.pin_to_host_optimization = (
rewriter_config_pb2.RewriterConfig.OFF)
return config
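# Sketch of how create_config_proto() is consumed elsewhere in this file (see
# _run_eval below); shown here only as an illustration:
#   config = create_config_proto(params)
#   with tf.Session(target='', config=config) as sess:
#     ...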
def get_mode_from_params(params):
"""Returns the mode in which this script is running.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
Raises:
ValueError: Unsupported params settings.
"""
if params.forward_only and params.eval:
raise ValueError('At most one of the forward_only and eval parameters may be true')
if params.eval:
return constants.BenchmarkMode.EVAL
elif params.forward_only:
return constants.BenchmarkMode.FORWARD_ONLY
elif (params.eval_during_training_every_n_steps or
params.eval_during_training_every_n_epochs or
params.eval_during_training_at_specified_steps or
params.eval_during_training_at_specified_epochs):
return constants.BenchmarkMode.TRAIN_AND_EVAL
else:
return constants.BenchmarkMode.TRAIN
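# For example, params with eval=False, forward_only=False and none of the
# eval_during_training_* options set map to constants.BenchmarkMode.TRAIN.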
# How many digits to show for the loss and accuracies during training.
LOSS_AND_ACCURACY_DIGITS_TO_SHOW = 3
def benchmark_one_step(sess,
fetches,
step,
batch_size,
step_train_times,
trace_filename,
partitioned_graph_file_prefix,
profiler,
image_producer,
params,
summary_op=None,
show_images_per_sec=True,
benchmark_logger=None,
collective_graph_key=0,
should_output_files=True):
"""Advance one step of benchmarking."""
should_profile = profiler and 0 <= step < _NUM_STEPS_TO_PROFILE
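# Note (added for clarity): warmup steps are numbered negatively in the main
# loop, so step == -2 below refers to one of the warmup iterations; traces and
# partitioned graphs are therefore captured once, before the timed steps begin.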
need_options_and_metadata = (
should_profile or collective_graph_key > 0 or
((trace_filename or partitioned_graph_file_prefix) and step == -2)
)
if need_options_and_metadata:
run_options = tf.RunOptions()
if (trace_filename and step == -2) or should_profile:
run_options.trace_level = tf.RunOptions.FULL_TRACE
if partitioned_graph_file_prefix and step == -2:
run_options.output_partition_graphs = True
if collective_graph_key > 0:
run_options.experimental.collective_graph_key = collective_graph_key
run_metadata = tf.RunMetadata()
else:
run_options = None
run_metadata = None
summary_str = None
start_time = time.perf_counter()
if summary_op is None:
results = sess.run(fetches, options=run_options, run_metadata=run_metadata)
else:
(results, summary_str) = sess.run(
[fetches, summary_op], options=run_options, run_metadata=run_metadata)
if not params.forward_only:
lossval = results['average_loss']
else:
lossval = 0.
if image_producer is not None:
image_producer.notify_image_consumption()
train_time = time.perf_counter() - start_time
step_train_times.append(train_time)
if (show_images_per_sec and step >= 0 and
(step == 0 or (step + 1) % params.display_every == 0)):
speed_mean, speed_uncertainty, speed_jitter = get_perf_timing(
batch_size, step_train_times, params.display_perf_ewma)
log_str = '%i\t%s\t%.*f' % (
step + 1,
get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter),
LOSS_AND_ACCURACY_DIGITS_TO_SHOW, lossval)
if 'top_1_accuracy' in results:
log_str += '\t%.*f\t%.*f' % (
LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_1_accuracy'],
LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_5_accuracy'])
log_fn(log_str)
if benchmark_logger:
benchmark_logger.log_metric(
'current_examples_per_sec', speed_mean, global_step=step + 1)
if 'top_1_accuracy' in results:
benchmark_logger.log_metric(
'top_1_accuracy', results['top_1_accuracy'], global_step=step + 1)
benchmark_logger.log_metric(
'top_5_accuracy', results['top_5_accuracy'], global_step=step + 1)
if need_options_and_metadata:
if should_profile:
profiler.add_step(step, run_metadata)
if trace_filename and step == -2 and should_output_files:
log_fn('Dumping trace to %s' % trace_filename)
trace_dir = os.path.dirname(trace_filename)
if not gfile.Exists(trace_dir):
gfile.MakeDirs(trace_dir)
with gfile.Open(trace_filename, 'w') as trace_file:
if params.use_chrome_trace_format:
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
trace_file.write(trace.generate_chrome_trace_format(show_memory=True))
else:
trace_file.write(str(run_metadata.step_stats))
if partitioned_graph_file_prefix and step == -2 and should_output_files:
path, filename = os.path.split(partitioned_graph_file_prefix)
if '.' in filename:
base_filename, ext = filename.rsplit('.', 1)
ext = '.' + ext
else:
base_filename, ext = filename, ''
as_text = filename.endswith('txt')
for graph_def in run_metadata.partition_graphs:
device = graph_def.node[0].device.replace('/', '_').replace(':', '_')
graph_filename = '%s%s%s' % (base_filename, device, ext)
log_fn('Writing partitioned GraphDef as %s to %s' % (
'text' if as_text else 'binary',
os.path.join(path, graph_filename)))
tf.train.write_graph(graph_def, path, graph_filename, as_text)
return (summary_str, lossval)
def get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter, scale=1):
if scale == 1:
# TODO(laigd): rename 'images' to maybe 'inputs', same below.
return ('images/sec: %.1f +/- %.1f (jitter = %.1f)' %
(speed_mean, speed_uncertainty, speed_jitter))
else:
return 'images/sec: %.1f' % speed_mean
def get_perf_timing(batch_size, step_train_times, ewma_alpha=None, scale=1):
"""Calculate benchmark processing speed."""
times = np.array(step_train_times)
speeds = batch_size / times
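# When ewma_alpha is given, np.logspace(n-1, 0, n, base=1-ewma_alpha) below
# yields weights (1-ewma_alpha)**(n-1), ..., (1-ewma_alpha)**0, so the most
# recent step times receive the largest weights in the average.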
if ewma_alpha:
weights = np.logspace(len(times)-1, 0, len(times), base=1-ewma_alpha)
time_mean = np.average(times, weights=weights)
else:
time_mean = np.mean(times)
speed_mean = scale * batch_size / time_mean
speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds)))
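# 1.4826 * median-absolute-deviation approximates the standard deviation for
# normally distributed data, giving a jitter estimate that is robust to
# outlier step times.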
speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds)))
return speed_mean, speed_uncertainty, speed_jitter
def load_checkpoint(saver, sess, ckpt_dir):
"""Loads checkpoint from provided directory or full path.
Args:
saver: Saver used to restore the checkpoint.
sess: TensorFlow session.
ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.
Returns:
Global step.
"""
model_checkpoint_path = _get_checkpoint_to_load(ckpt_dir)
global_step = model_checkpoint_path.split('/')[-1].split('-')[-1]
if not global_step.isdigit():
global_step = 0
else:
global_step = int(global_step)
saver.restore(sess, model_checkpoint_path)
log_fn('Successfully loaded model from %s.' % model_checkpoint_path)
return global_step
def _get_checkpoint_to_load(ckpt_dir):
"""Returns which checkpoint to load.
Args:
ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.
Returns:
Full path to checkpoint to load.
Raises:
CheckpointNotFoundException: If checkpoint is not found.
"""
p = re.compile(r'ckpt-\d+$')
if p.search(ckpt_dir):
model_checkpoint_path = ckpt_dir
else:
# Finds latest checkpoint in directory provided
ckpt = tf.train.get_checkpoint_state(ckpt_dir)
if ckpt and ckpt.model_checkpoint_path:
model_checkpoint_path = ckpt.model_checkpoint_path
else:
raise CheckpointNotFoundException('No checkpoint file found in dir:{}'.
format(ckpt_dir))
return model_checkpoint_path
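# Illustrative inputs (hypothetical paths): ckpt_dir='/tmp/train' falls through
# to tf.train.get_checkpoint_state() and loads the latest checkpoint recorded
# there, while ckpt_dir='/tmp/train/model.ckpt-5000' matches r'ckpt-\d+$' and
# is returned as-is.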
# Params are passed to BenchmarkCNN's constructor. Params is a map from name
# to value, with one field per key in flags.param_specs.
#
# Call make_params() or make_params_from_flags() below to construct a Params
# tuple with default values from flags.param_specs, rather than constructing
# Params directly.
Params = namedtuple('Params', flags.param_specs.keys()) # pylint: disable=invalid-name
def validate_params(params):
"""Validates that the Params tuple had valid values.
When command-line flags are defined for each ParamSpec by calling
flags.define_flags(), calling this function is unnecessary because absl
already does flag validation. Otherwise, this function should be called.
Args:
params: A Params tuple.
Raises:
ValueError: An element of params had an invalid value.
"""
for name, value in params._asdict().items():
param_spec = flags.param_specs[name]
if param_spec.flag_type in ('integer', 'float'):
if (value is not None and param_spec.kwargs['lower_bound'] is not None and
value < param_spec.kwargs['lower_bound']):
raise ValueError('Param %s value of %s is lower than the lower bound '
'of %s' %
(name, value, param_spec.kwargs['lower_bound']))
if (value is not None and param_spec.kwargs['upper_bound'] is not None and
param_spec.kwargs['upper_bound'] < value):
raise ValueError('Param %s value of %s is higher than the upper bound '
'of %s' %
(name, value, param_spec.kwargs['upper_bound']))
elif (value is not None and param_spec.flag_type == 'enum' and
value not in param_spec.kwargs['enum_values']):
raise ValueError('Param %s of value %s is not in %s'%
(name, value, param_spec.kwargs['enum_values']))
def make_params(**kwargs):
"""Create a Params tuple for BenchmarkCNN from kwargs.
Default values are filled in from flags.param_specs.
Args:
**kwargs: kwarg values will override the default values.
Returns:
Params namedtuple for constructing BenchmarkCNN.
"""
# Create a (name: default_value) map from flags.param_specs.
default_kwargs = {
name: flags.param_specs[name].default_value
for name in flags.param_specs
}
params = Params(**default_kwargs)._replace(**kwargs)
validate_params(params)
return params
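# Hypothetical usage (field names and values are only examples):
#   params = make_params(num_gpus=2, batch_size=64)
# starts from the defaults in flags.param_specs, overrides those two fields,
# validates the result, and returns the Params namedtuple.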
def make_params_from_flags():
"""Create a Params tuple for BenchmarkCNN from absl_flags.FLAGS.
Returns:
Params namedtuple for constructing BenchmarkCNN.
"""
# Collect (name: value) pairs for absl_flags.FLAGS with matching names in
# flags.param_specs.
flag_values = {name: getattr(absl_flags.FLAGS, name)
for name in flags.param_specs.keys()}
return Params(**flag_values)
def remove_param_fields(params, fields_to_remove):
"""Remove fields from a Params namedtuple."""
params_dict = params._asdict()
for field in fields_to_remove:
assert field in params_dict, 'Invalid Params field: ' + field
params_dict = {k: v for k, v in params_dict.items()
if k not in fields_to_remove}
new_params_type = namedtuple('Params', params_dict.keys())
return new_params_type(**params_dict)
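# Sketch of intended use (mirroring the TRAIN_AND_EVAL handling later in this
# file): remove_param_fields(params, {'eval'}) returns a namedtuple without the
# 'eval' field, so any leftover read of params.eval raises instead of silently
# returning a stale value.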
def get_num_batches_and_epochs(params, batch_size, num_examples_per_epoch):
"""Returns the number of batches and epochs to run for.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
batch_size: The number of images per step.
num_examples_per_epoch: The number of images in a single epoch.
Returns:
num_batches: The number of batches to run for.
num_epochs: The number of epochs to run for. This might be slightly
smaller than params.num_epochs if specified, because the number of batches
must be an integer.
Raises:
ValueError: Invalid or unsupported params.
"""
if params.num_batches and params.num_epochs:
raise ValueError('At most one of --num_batches and --num_epochs may be '
'specified.')
if params.num_epochs:
num_batches = int(params.num_epochs * num_examples_per_epoch +
batch_size - 1) // batch_size
else:
num_batches = params.num_batches or _DEFAULT_NUM_BATCHES
num_epochs = num_batches * batch_size / num_examples_per_epoch
return (num_batches, num_epochs)
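# Worked example with made-up numbers: batch_size=256,
# num_examples_per_epoch=1024 and --num_epochs=2 give
# num_batches = (2 * 1024 + 255) // 256 = 8 and num_epochs = 8 * 256 / 1024
# = 2.0; with --num_batches=10 instead, num_epochs would be 10 * 256 / 1024
# = 2.5.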
def get_piecewise_learning_rate(piecewise_learning_rate_schedule,
global_step, num_batches_per_epoch):
"""Returns a piecewise learning rate tensor.
Args:
piecewise_learning_rate_schedule: The --piecewise_learning_rate_schedule
parameter
global_step: Scalar tensor representing the global step.
num_batches_per_epoch: float indicating the number of batches per epoch.
Returns:
A scalar float tensor, representing the learning rate.
Raises:
ValueError: piecewise_learning_rate_schedule is not formatted correctly.
"""
pieces = piecewise_learning_rate_schedule.split(';')
if len(pieces) % 2 == 0:
raise ValueError('--piecewise_learning_rate_schedule must have an odd '
'number of components')
values = []
boundaries = []
for i, piece in enumerate(pieces):
if i % 2 == 0:
try:
values.append(float(piece))
except ValueError:
raise ValueError('Invalid learning rate: ' + piece)
else:
try:
boundaries.append(int(int(piece) * num_batches_per_epoch) - 1)
except ValueError:
raise ValueError('Invalid epoch: ' + piece)
return tf.train.piecewise_constant(global_step, boundaries, values,
name='piecewise_learning_rate')
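# Illustrative schedule string (hypothetical values): '0.1;10;0.01;20;0.001' is
# parsed into values [0.1, 0.01, 0.001] and epoch boundaries [10, 20], i.e. use
# 0.1 until epoch 10, then 0.01 until epoch 20, then 0.001; the boundaries are
# converted to global steps via num_batches_per_epoch before being passed to
# tf.train.piecewise_constant.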
def get_learning_rate(params, global_step, num_examples_per_epoch, model,
batch_size):
"""Returns a learning rate tensor based on global_step.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
global_step: Scalar tensor representing the global step.
num_examples_per_epoch: The number of examples per epoch.
model: The model.Model object to obtain the default learning rate from if no
learning rate is specified.
batch_size: Number of examples per step
Returns:
A scalar float tensor, representing the learning rate. When evaluated, the
learning rate depends on the current value of global_step.
Raises:
ValueError: Invalid or unsupported params.
"""
with tf.name_scope('learning_rate'):
num_batches_per_epoch = num_examples_per_epoch / batch_size
if params.piecewise_learning_rate_schedule:
if (params.init_learning_rate is not None or
params.learning_rate_decay_factor or
params.minimum_learning_rate or params.num_epochs_per_decay):
raise ValueError('No other learning rate-related flags can be '
'specified if --piecewise_learning_rate_schedule is '
'specified')
learning_rate = get_piecewise_learning_rate(
params.piecewise_learning_rate_schedule,
global_step, num_batches_per_epoch)
elif params.init_learning_rate is not None:
learning_rate = params.init_learning_rate
if (params.num_epochs_per_decay > 0 and
params.learning_rate_decay_factor > 0):
decay_steps = int(num_batches_per_epoch * params.num_epochs_per_decay)
# Decay the learning rate exponentially based on the number of steps.
learning_rate = tf.train.exponential_decay(
params.init_learning_rate,
global_step,
decay_steps,
params.learning_rate_decay_factor,
staircase=True)
if params.minimum_learning_rate != 0.:
learning_rate = tf.maximum(learning_rate,
params.minimum_learning_rate)
else:
learning_rate = model.get_learning_rate(global_step, batch_size)
if params.num_learning_rate_warmup_epochs > 0 and (
params.init_learning_rate is not None or
params.piecewise_learning_rate_schedule):
warmup_steps = int(num_batches_per_epoch *
params.num_learning_rate_warmup_epochs)
init_lr = params.init_learning_rate
if init_lr is None:
init_lr = float(params.piecewise_learning_rate_schedule.split(';')[0])
warmup_lr = init_lr * tf.cast(global_step, tf.float32) / tf.cast(
warmup_steps, tf.float32)
learning_rate = tf.cond(global_step < warmup_steps,
lambda: warmup_lr, lambda: learning_rate)
learning_rate = mlperf.logger.log_deferred_tensor_value(
mlperf.tags.OPT_LR, learning_rate, global_step, every_n=100)
return learning_rate
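# Note on the warmup branch above (restating the logic, not adding behavior):
# when --num_learning_rate_warmup_epochs > 0 the effective learning rate ramps
# linearly from 0 to the initial rate over warmup_steps global steps, after
# which the regular schedule applies.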
def get_optimizer(params, learning_rate):
"""Returns the optimizer that should be used based on params."""
if params.optimizer == 'momentum':
mlperf.logger.log(key=mlperf.tags.OPT_NAME,
value=mlperf.tags.SGD_WITH_MOMENTUM)
mlperf.logger.log(key=mlperf.tags.OPT_MOMENTUM, value=params.momentum)
opt = tf.train.MomentumOptimizer(
learning_rate, params.momentum, use_nesterov=True)
elif params.optimizer == 'sgd':
mlperf.logger.log(key=mlperf.tags.OPT_NAME, value=mlperf.tags.SGD)
opt = tf.train.GradientDescentOptimizer(learning_rate)
elif params.optimizer == 'rmsprop':
opt = tf.train.RMSPropOptimizer(
learning_rate,
params.rmsprop_decay,
momentum=params.rmsprop_momentum,
epsilon=params.rmsprop_epsilon)
elif params.optimizer == 'adam':
opt = tf.train.AdamOptimizer(learning_rate, params.adam_beta1,
params.adam_beta2, params.adam_epsilon)
else:
raise ValueError('Optimizer "{}" was not recognized'.
format(params.optimizer))
return opt
def generate_tfprof_profile(profiler, tfprof_file):
"""Generates a tfprof profile, writing it to a file and printing top ops.
Args:
profiler: A tf.profiler.Profiler. `profiler.add_step` must have already been
called.
tfprof_file: The filename to write the ProfileProto to.
"""
profile_proto = profiler.serialize_to_string()
log_fn('Dumping ProfileProto to %s' % tfprof_file)
with gfile.Open(tfprof_file, 'wb') as f:
f.write(profile_proto)
# Print out the execution times of the top operations. Note this
# information can also be obtained with the dumped ProfileProto, but
# printing it means tfprof doesn't have to be used if all the user wants
# is the top ops.
options = tf.profiler.ProfileOptionBuilder.time_and_memory()
options['max_depth'] = _NUM_OPS_TO_PRINT
options['order_by'] = 'accelerator_micros'
profiler.profile_operations(options)
class BenchmarkCNN(object):
"""Class for benchmarking a cnn network."""
def __init__(self, params, dataset=None, model=None):
"""Initialize BenchmarkCNN.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
dataset: If not None, the dataset to use. Otherwise, params is used to
obtain the dataset.
model: If not None, the model to use. Otherwise, params is used to obtain
the model.
Raises:
ValueError: Unsupported params settings.
"""
mlperf.logger.log(key=mlperf.tags.RUN_START)
self.params = params
if params.eval:
self._doing_eval = True
else:
# Note self._doing_eval can later switch to True in self._do_eval() if
# self.params.eval_during_training_* is specified.
self._doing_eval = False
self.dataset = dataset or datasets.create_dataset(self.params.data_dir,
self.params.data_name)
self.model = model or model_config.get_model_config(
self.params.model, self.dataset, self.params)
self.trace_filename = self.params.trace_file
self.rewriter_config = self.params.rewriter_config
autotune_threshold = self.params.autotune_threshold if (
self.params.autotune_threshold) else 1
min_autotune_warmup = 5 * autotune_threshold * autotune_threshold
self.num_warmup_batches = self.params.num_warmup_batches if (
self.params.num_warmup_batches is not None) else max(
10, min_autotune_warmup)
self.graph_file = self.params.graph_file
self.resize_method = self.params.resize_method
self.sync_queue_counter = 0
self.num_gpus = self.params.num_gpus
if self.params.gpu_indices:
self.gpu_indices = [int(x) for x in self.params.gpu_indices.split(',')]
else:
self.gpu_indices = [x for x in range(self.num_gpus)]
if (self.params.device == 'cpu' and self.params.data_format == 'NCHW' and
not self.params.mkl):
raise ValueError('device=cpu requires that data_format=NHWC')
if ((self.params.num_epochs_per_decay or
self.params.learning_rate_decay_factor) and
not (self.params.init_learning_rate is not None and
self.params.num_epochs_per_decay
and self.params.learning_rate_decay_factor)):
raise ValueError('If one of num_epochs_per_decay or '
'learning_rate_decay_factor is set, both must be set '
'and learning_rate must be set')
if (self.params.minimum_learning_rate and
not (self.params.init_learning_rate is not None and
self.params.num_epochs_per_decay and
self.params.learning_rate_decay_factor)):
raise ValueError('minimum_learning_rate requires learning_rate, '
'num_epochs_per_decay, and '
'learning_rate_decay_factor to be set')
if (self.params.use_fp16 and self.params.fp16_vars and
'replicated' in self.params.variable_update and
self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec):
raise ValueError('fp16 variables are not supported with NCCL')
if (self.params.use_fp16 and self.params.fp16_vars and
self.params.gradient_repacking):
raise ValueError('--fp16_vars cannot be used with --gradient_repacking')
if self.params.variable_update == 'horovod' and self.params.num_gpus > 1:
raise ValueError('Horovod benchmarks require num_gpus=1 on each worker')
if self.params.variable_update == 'horovod' and self.params.job_name:
raise ValueError('job_name should not be specified for Horovod.')
if self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale:
if self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec:
raise ValueError('Automatic loss scaling is not supported with NCCL.')
if self.params.variable_update not in ('parameter_server', 'replicated',
'independent'):
raise ValueError('Automatic loss scaling is not supported with '
'variable_update=%s.' % self.params.variable_update)
if self.params.staged_vars:
raise ValueError('Automatic loss scaling is not supported with '
'staged_vars.')
if (self.params.debugger is not None and self.params.debugger != 'cli' and
':' not in self.params.debugger):
raise ValueError('--debugger must be "cli" or in the form '
'host:port')
if self.params.hierarchical_copy and self.params.num_gpus <= 1:
raise ValueError('--hierarchical_copy requires --num_gpus to be greater '
'than 1')
if params.save_model_secs and params.save_model_steps:
raise ValueError('At most one of --save_model_secs and '
'--save_model_steps can be specified')
eval_during_training_flags = list(map(bool, [
params.eval_during_training_every_n_steps,
params.eval_during_training_every_n_epochs,
params.eval_during_training_at_specified_steps,
params.eval_during_training_at_specified_epochs,
]))
if eval_during_training_flags.count(True) > 1:
raise ValueError('At most one flag with --eval_during_training_* prefix '
'may be specified.')
eval_during_training_enabled = any(eval_during_training_flags)
if eval_during_training_enabled:
if params.eval:
raise ValueError('At most one of --eval and --eval_during_training_* '
'may be specified')
if params.forward_only:
raise ValueError('At most one of --forward_only and '
'--eval_during_training_* may be specified')
if params.job_name:
raise ValueError('--eval_during_training_* is not yet supported in '
'distributed mode.')
if params.staged_vars:
raise ValueError('--eval_during_training_* is not currently compatible '
'with --staged_vars')
if params.stop_at_top_1_accuracy and not eval_during_training_enabled:
raise ValueError('--stop_at_top_1_accuracy is only supported with '
'--eval_during_training_*')
if params.collect_eval_results_async and params.model != 'ssd300':
raise ValueError('--collect_eval_results_async only works with ssd300 '
'model currently.')
if self.params.forward_only and self.params.freeze_when_forward_only:
if self.params.train_dir is not None:
raise ValueError('In forward_only mode, when --freeze_when_forward_only'
' is True, --train_dir should not be specified')
if self.params.data_dir and not self.params.datasets_use_prefetch:
raise ValueError('In forward_only mode, when --freeze_when_forward_only'
' is True and --data_dir is set, '
'--datasets_use_prefetch should be set to True')
if self.params.job_name:
raise ValueError('In forward_only mode, when --freeze_when_forward_only'
' is True, --job_name should not be specified and '
'distributed running is not supported')
self.forward_only_and_freeze = True
else:
self.forward_only_and_freeze = False
if self.params.trt_mode:
raise ValueError('--trt_mode should not be specified if one of '
'--forward_only and --freeze_when_forward_only is set '
'to False')
self.mode = get_mode_from_params(self.params)
# Use the batch size from the command line if specified, otherwise use the
# model's default batch size. Scale the benchmark's batch size by the
# number of GPUs.
if self.params.batch_size > 0:
self.model.set_batch_size(self.params.batch_size)
self.batch_size = self.model.get_batch_size() * self.num_gpus
if self.mode in (constants.BenchmarkMode.TRAIN,
constants.BenchmarkMode.TRAIN_AND_EVAL):
self.train_batch_size = self.batch_size
else:
self.train_batch_size = None
if self.mode in (constants.BenchmarkMode.EVAL,
constants.BenchmarkMode.TRAIN_AND_EVAL):
if self.params.eval_batch_size > 0:
self.eval_batch_size = self.params.eval_batch_size * self.num_gpus
else:
self.eval_batch_size = self.batch_size
else:
self.eval_batch_size = None
self.batch_group_size = self.params.batch_group_size
self.enable_auto_loss_scale = (
self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale)
self.loss_scale = None
self.loss_scale_normal_steps = None
self.job_name = self.params.job_name # "" for local training
# PS server is used for distributed jobs not using all-reduce.
use_ps_server = self.job_name and (self.params.variable_update !=
'distributed_all_reduce' and
self.params.variable_update !=
'collective_all_reduce')
# controller is used for distributed_all_reduce with > 1 worker.
use_controller = (
self.params.variable_update == 'distributed_all_reduce' and
self.job_name)
if use_controller and not params.controller_host:
raise ValueError('When variable_update==distributed_all_reduce '
'controller_host must also be specified.')
self.single_session = (
self.params.variable_update == 'distributed_all_reduce')
# collective_all_reduce doesn't need a controller or ps
self.distributed_collective = (
self.params.variable_update == 'collective_all_reduce' and
self.job_name)
self.local_parameter_device_flag = self.params.local_parameter_device
if self.job_name:
self.task_index = self.params.task_index
self.cluster_manager = platforms_util.get_cluster_manager(
params, create_config_proto(params))
assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager)
worker_prefix = '/job:worker/replica:0/task:%s' % self.task_index
if use_ps_server:
self.param_server_device = tf.train.replica_device_setter(
worker_device=worker_prefix + '/cpu:0',
cluster=self.cluster_manager.get_cluster_spec())
# The device on which the queues for managing synchronization between
# servers should be stored.
self.sync_queue_devices = [
'/job:ps/replica:0/task:%s/cpu:0' % i
for i in range(self.cluster_manager.num_ps())
]
else:
self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0']
else:
self.task_index = 0
self.cluster_manager = None
worker_prefix = ''
self.param_server_device = '/%s:0' % self.params.local_parameter_device
self.sync_queue_devices = [self.param_server_device]
if self.cluster_manager:
self.num_workers = self.cluster_manager.num_workers()
elif self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
self.num_workers = hvd.size()
else:
self.num_workers = 1
self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0
if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl':
raise ValueError('--all_reduce_spec=nccl is invalid in a '
'multi-worker job')
# Device to use for ops that need to always run on the local worker's CPU.
self.cpu_device = '%s/cpu:0' % worker_prefix
# Device to use for ops that need to always run on the local worker's
# compute device, and never on a parameter server device.
self.raw_devices = [
'%s/%s:%i' % (worker_prefix, self.params.device, i)
for i in xrange(self.num_gpus)
]
subset = 'validation' if params.eval else 'train'
self.num_batches, self.num_epochs = get_num_batches_and_epochs(
params, self.batch_size * self.num_workers,
self.dataset.num_examples_per_epoch(subset))
if self.mode in (constants.BenchmarkMode.EVAL,
constants.BenchmarkMode.TRAIN_AND_EVAL):
# TODO(reedwm): Currently we do extra eval logic for num_eval_batches and
# the preprocessor. We should encapsulate this logic into a shared
# function or class.
if params.num_eval_batches is None and params.num_eval_epochs is None:
eval_params = self.params
else:
eval_params = self.params._replace(
num_batches=self.params.num_eval_batches,
num_epochs=self.params.num_eval_epochs)
self.num_eval_batches, self.num_eval_epochs = get_num_batches_and_epochs(
eval_params, self.eval_batch_size * self.num_workers,
self.dataset.num_examples_per_epoch('validation'))
else:
self.num_eval_batches, self.num_eval_epochs = None, None
num_train_examples_per_epoch = self.dataset.num_examples_per_epoch('train')
if self.params.eval_during_training_every_n_epochs:
n_epochs = self.params.eval_during_training_every_n_epochs
self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size)
for e in np.arange(n_epochs, self.num_epochs, n_epochs)}
if self.params.eval_during_training_at_specified_steps:
try:
self.eval_during_training_at_specified_steps = set(map(
int, self.params.eval_during_training_at_specified_steps))
except ValueError:
raise ValueError('Param eval_during_training_at_specified_steps value '
'of %s cannot be converted to a list of integers.' %
(self.params.eval_during_training_at_specified_steps))
if self.params.eval_during_training_at_specified_epochs:
try:
n_epochs = list(map(
float, self.params.eval_during_training_at_specified_epochs))
offset = n_epochs[0] - 1
if offset.is_integer():
offset = int(offset)
mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
self.eval_during_training_at_specified_steps = {
(int(e * num_train_examples_per_epoch + self.batch_size - 1) //
self.batch_size)
for e in n_epochs}
except ValueError:
raise ValueError('Param eval_during_training_at_specified_epochs value '
'of %s cannot be converted to a list of floats.' %
(self.params.eval_during_training_at_specified_epochs))
if params.eval_during_training_every_n_epochs:
offset = params.eval_during_training_every_n_epochs - 1
if offset.is_integer():
offset = int(offset)
mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
if (self.params.staged_vars and
self.params.variable_update != 'parameter_server'):
raise ValueError('staged_vars for now is only supported with '
'variable_update=parameter_server')
if self.params.variable_update == 'parameter_server':
if self.job_name:
if not self.params.staged_vars:
self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS(
self)
else:
self.variable_mgr = (
variable_mgr.VariableMgrDistributedFetchFromStagedPS(self))
else:
if not self.params.staged_vars:
self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
else:
self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS(
self)
elif self.params.variable_update == 'replicated':
if self.job_name:
raise ValueError('Invalid variable_update in distributed mode: %s' %
self.params.variable_update)
self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
self, self.params.all_reduce_spec,
self.params.agg_small_grads_max_bytes,
self.params.agg_small_grads_max_group,
self.params.allreduce_merge_scope)
elif self.params.variable_update == 'distributed_all_reduce':
assert self.params.cross_replica_sync
self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce(
self, self.params.all_reduce_spec,
('worker' if self.num_workers > 1 else 'localhost'),
self.num_workers, self.params.agg_small_grads_max_bytes,
self.params.agg_small_grads_max_group,
self.params.allreduce_merge_scope)
elif self.params.variable_update == 'collective_all_reduce':
assert self.params.cross_replica_sync
self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce(
self, self.params.all_reduce_spec,
self.num_workers, self.num_gpus, self.task_index,
self.params.allreduce_merge_scope)
elif self.params.variable_update == 'distributed_replicated':
assert self.params.cross_replica_sync
if not self.job_name:
raise ValueError('Invalid variable_update in local mode: %s' %
self.params.variable_update)
self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self)
elif self.params.variable_update in ('independent', 'horovod'):
if self.job_name:
raise ValueError('Invalid variable_update in distributed mode: %s' %
self.params.variable_update)
self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
else:
raise ValueError(
'Invalid variable_update: %s' % self.params.variable_update)
# Device to use for running on the local worker's compute device, but
# with variables assigned to parameter server devices.
self.devices = self.variable_mgr.get_devices()
if self.job_name:
if use_ps_server:
self.global_step_device = self.param_server_device
elif self.params.variable_update == 'collective_all_reduce':
self.global_step_device = self.cpu_device
else:
self.global_step_device = '/job:worker/replica:0/task:0/cpu:0'
else:
self.global_step_device = self.cpu_device
self.input_preprocessor = None
self.eval_input_preprocessor = None
if not self.dataset.use_synthetic_gpu_inputs():
if not self.params.eval:
self.input_preprocessor = self.get_input_preprocessor()
if self.mode in (constants.BenchmarkMode.EVAL,
constants.BenchmarkMode.TRAIN_AND_EVAL):
with self._do_eval():
self.eval_input_preprocessor = self.get_input_preprocessor()
self.datasets_use_prefetch = (
self.params.datasets_use_prefetch and
# TODO(rohanj): Figure out why --datasets_use_prefetch freezes on the
# CPU.
self.params.device.lower() != 'cpu' and
self.input_preprocessor and
self.input_preprocessor.supports_datasets())
self.init_global_step = 0
self._config_benchmark_logger()
if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
# Remove "eval" from params so it is not accidentally used. Since eval can
# still occur despite params.eval being False, params.eval should never
# be used. We cannot yet remove this unconditionally, because the SSD
# model still uses params.eval, and hence does not work properly with
# --eval_during_training_*.
# TODO(b/116627045): We should also remove fields that have an eval
# equivalent, like num_batches and num_eval_batches.
self.params = remove_param_fields(self.params, {'eval'})
@contextlib.contextmanager
def _do_eval(self):
"""Context manager to switches BenchmarkCNN to eval mode.
Any evaluation code should be put under this context manager. This context
manager switches self._doing_eval to True. It also switches certain
attributes, like self.num_batches and self.num_epochs, to be the number of
batches and epochs for evaluation, respectively.
Yields:
Nothing.
"""
# TODO(b/116627045): Find a more general way of switching attributes to the
# eval equivalents.
old_doing_eval = self._doing_eval
old_num_batches = self.num_batches
old_num_epochs = self.num_epochs
old_batch_size = self.batch_size
try:
self._doing_eval = True
self.num_batches = self.num_eval_batches
self.num_epochs = self.num_eval_epochs
self.batch_size = self.eval_batch_size
self.model.set_batch_size(self.eval_batch_size // self.num_gpus)
yield
finally:
self._doing_eval = old_doing_eval
self.num_batches = old_num_batches
self.num_epochs = old_num_epochs
self.batch_size = old_batch_size
self.model.set_batch_size(old_batch_size // self.num_gpus)
def _config_benchmark_logger(self):
"""Config the model garden benchmark logger."""
model_benchmark_logger = None
if self.params.benchmark_log_dir is not None:
try:
from official.r1.utils.logs import logger as models_logger # pylint: disable=g-import-not-at-top
except ImportError:
tf.logging.fatal('Please add tensorflow/models to the PYTHONPATH '
'in order to use BenchmarkLogger. Configured '
'benchmark_log_dir: %s'
% self.params.benchmark_log_dir)
raise
model_benchmark_logger = models_logger.BenchmarkFileLogger(
self.params.benchmark_log_dir)
self.benchmark_logger = model_benchmark_logger
# TODO(laigd): this changes the global device list which is used everywhere,
# consider refactoring it.
def reset_devices_for_task(self, task_num, is_local=False):
"""Used to imitate another task when building a distributed graph."""
worker_prefix = ('/job:localhost' if is_local else
'/job:worker/replica:0/task:%s' % task_num)
self.cpu_device = '%s/cpu:0' % worker_prefix
self.raw_devices = [
'%s/%s:%i' % (worker_prefix, self.params.device, i)
for i in xrange(self.num_gpus)
]
self.devices = self.variable_mgr.get_devices()
def raw_devices_across_tasks(self, is_local=False):
"""Returns list of raw device names across all tasks."""
if is_local:
assert self.num_workers == 1
return self.raw_devices
else:
return [
'job:worker/replica:0/task%s/%s:%i' % (t, self.params.device, i)
for t in xrange(self.num_workers)
for i in xrange(self.num_gpus)
]
def print_info(self):
"""Print basic information."""
benchmark_info = self._get_params_info()
log_fn('Model: %s' % self.model.get_model_name())
log_fn('Dataset: %s' % benchmark_info['dataset_name'])
log_fn('Mode: %s' % self.mode)
log_fn('SingleSess: %s' % benchmark_info['single_session'])
log_fn('Batch size: %s global' % (self.batch_size * self.num_workers))
log_fn(' %s per device' % (self.batch_size //
len(self.raw_devices)))
if self.batch_group_size > 1:
log_fn(' %d batches per preprocessing group' %
self.batch_group_size)
log_fn('Num batches: %d' % self.num_batches)
log_fn('Num epochs: %.2f' % self.num_epochs)
log_fn('Devices: %s' % benchmark_info['device_list'])
log_fn('NUMA bind: %s' % self.params.use_numa_affinity)
log_fn('Data format: %s' % self.params.data_format)
if self.rewriter_config:
log_fn('RewriterConfig: %s' % self.rewriter_config)
log_fn('Optimizer: %s' % self.params.optimizer)
log_fn('Variables: %s' % self.params.variable_update)
if (self.params.variable_update == 'replicated' or
self.params.variable_update == 'distributed_all_reduce'
or self.params.variable_update == 'collective_all_reduce'):
log_fn('AllReduce: %s' % self.params.all_reduce_spec)
if self.job_name:
log_fn('Sync: %s' % self.params.cross_replica_sync)
if self.params.staged_vars:
log_fn('Staged vars: %s' % self.params.staged_vars)
if self.params.variable_update == 'horovod' and self.params.horovod_device:
log_fn('Horovod on: %s' % self.params.horovod_device)
log_fn('==========')
def _get_params_info(self):
"""Get the common parameters info for the benchmark run.
Returns:
A dict of processed parameters.
"""
dataset_name = self.dataset.name
if self.dataset.use_synthetic_gpu_inputs():
dataset_name += ' (synthetic)'
single_session = self.params.variable_update == 'distributed_all_reduce'
if single_session:
device_list = self.raw_devices_across_tasks()
elif self.params.variable_update == 'horovod':
device_list = ['horovod/%s:%d' % (self.params.device, idx)
for idx in range(self.num_workers)]
else:
device_list = self.raw_devices
return {
'dataset_name': dataset_name,
'single_session': single_session,
'device_list': device_list,}
def _log_benchmark_run(self):
"""Log the benchmark info to the logger.
The info logged here should be similar to print_info(), but in a structured
JSON format.
"""
if self.benchmark_logger:
benchmark_info = self._get_params_info()
run_param = {
'model': self.model.get_model_name(),
'dataset': benchmark_info['dataset_name'],
'mode': self.mode,
'single_sess': benchmark_info['single_session'],
'devices': benchmark_info['device_list'],
'batch_size': self.batch_size,
'batch_size_per_device': self.batch_size // len(self.raw_devices),
'num_batches': self.num_batches,
'num_epochs': self.num_epochs,
'data_format': self.params.data_format,
'rewrite_config': self.rewriter_config,
'optimizer': self.params.optimizer,
'session_config': create_config_proto(self.params),
}
# TODO(scottzhu): tf_cnn_benchmark might execute several times with
# different param setting on the same box. This will cause the run file to
# only contain the latest info. The benchmark_log_dir should be updated
# for every new run.
self.benchmark_logger.log_run_info(
self.model.get_model_name(), benchmark_info['dataset_name'],
run_param, test_id=self.params.benchmark_test_id)
def run(self):
"""Run the benchmark task assigned to this process.
Returns:
Dictionary of statistics for training or eval.
Raises:
ValueError: unrecognized job name.
"""
if self.params.job_name == 'ps':
log_fn('Running parameter server %s' % self.task_index)
self.cluster_manager.join_server()
return {}
# For distributed_all_reduce with multiple workers, drive
# from a separate controller process.
if self.params.variable_update == 'distributed_all_reduce':
if self.params.job_name == 'worker':
log_fn('Starting worker %s' % self.task_index)
self.cluster_manager.join_server()
return
elif self.params.job_name and self.params.job_name != 'controller':
raise ValueError('unrecognized job name: %s' % self.params.job_name)
self._log_benchmark_run()
if self._doing_eval:
with tf.Graph().as_default():
# TODO(laigd): freeze the graph in eval mode.
return self._run_eval()
else:
return self._benchmark_train()
def _run_eval(self):
"""Evaluate a model every self.params.eval_interval_secs.
Returns:
Dictionary containing eval statistics. Currently returns an empty
dictionary.
Raises:
ValueError: If self.params.train_dir is unspecified.
"""
if self.params.train_dir is None:
raise ValueError('Trained model directory not specified')
graph_info = self._build_eval_graph()
saver = tf.train.Saver(self.variable_mgr.savable_variables())
summary_writer = tf.summary.FileWriter(self.params.eval_dir,
tf.get_default_graph())
target = ''
# TODO(huangyp): Check if checkpoints haven't updated for hours and abort.
while True:
with tf.Session(
target=target, config=create_config_proto(self.params)) as sess:
image_producer = None
try:
global_step = load_checkpoint(saver, sess, self.params.train_dir)
image_producer = self._initialize_eval_graph(
graph_info.enqueue_ops, graph_info.input_producer_op,
graph_info.local_var_init_op_group, sess)
except CheckpointNotFoundException:
log_fn('Checkpoint not found in %s' % self.params.train_dir)
else: # Only executes if an exception was not thrown
self._eval_once(sess, summary_writer, graph_info.fetches,
graph_info.summary_op, image_producer, global_step)
if image_producer is not None:
image_producer.done()
if self.params.eval_interval_secs <= 0:
break
time.sleep(self.params.eval_interval_secs)
return {}
def _build_eval_graph(self, scope_name=None):
"""Build the evaluation graph.
Args:
scope_name: String to filter what summaries are collected. Only summary
ops whose name contains `scope_name` will be added, which is useful for
only including evaluation ops.
Returns:
A GraphInfo named_tuple containing various useful ops and tensors of the
evaluation graph.
"""
with self._do_eval():
input_producer_op, enqueue_ops, fetches = self._build_model()
local_var_init_op = tf.local_variables_initializer()
table_init_ops = tf.tables_initializer()
variable_mgr_init_ops = [local_var_init_op]
if table_init_ops:
variable_mgr_init_ops.extend([table_init_ops])
with tf.control_dependencies([local_var_init_op]):
variable_mgr_init_ops.extend(self.variable_mgr.get_post_init_ops())
local_var_init_op_group = tf.group(*variable_mgr_init_ops)
summary_op = tf.summary.merge_all(scope=scope_name)
# The eval graph has no execution barrier because it doesn't run in
# distributed mode.
execution_barrier = None
# We do not use the global step during evaluation.
global_step = None
return GraphInfo(input_producer_op, enqueue_ops, fetches,
execution_barrier, global_step, local_var_init_op_group,
summary_op)
# TODO(reedwm): For consistency, we should have a similar
# "_initialize_train_graph" function. They can likely be the same function.
def _initialize_eval_graph(self, enqueue_ops, input_producer_op,
local_var_init_op_group, sess):
"""Initializes the evaluation graph.
Args:
enqueue_ops: Ops that adds the preprocessed images to the staging areas.
input_producer_op: Op that produces the input batches (before
preprocessing).
local_var_init_op_group: Group of ops that perform per-device
initialization work.
sess: The session to initialize the eval graph with.
Returns:
An ImageProducer, or None if an ImageProducer isn't being used.
"""
with self._do_eval():
if local_var_init_op_group is not None:
# We might reinitialize local variables if they were already initialized
# during training. This is OK.
sess.run(local_var_init_op_group)
if self.dataset.queue_runner_required():
tf.train.start_queue_runners(sess=sess)
image_producer = None
if input_producer_op is not None:
image_producer = cnn_util.ImageProducer(
sess, input_producer_op, self.batch_group_size,
self.params.use_python32_barrier)
image_producer.start()
if enqueue_ops:
for i in xrange(len(enqueue_ops)):
sess.run(enqueue_ops[:(i + 1)])
if image_producer is not None:
image_producer.notify_image_consumption()
return image_producer
def _eval_once(self, sess, summary_writer, fetches, summary_op,
image_producer, global_step):
"""Evaluate the model using the validation dataset."""
with self._do_eval():
mlperf.logger.log_eval_epoch(
mlperf.tags.EVAL_START, global_step, self.batch_size)
loop_start_time = start_time = time.perf_counter()
# TODO(laigd): refactor the part to compute/report the accuracy. Currently
# it only works for image models.
top_1_accuracy_sum = 0.0
top_5_accuracy_sum = 0.0
total_eval_count = self.num_batches * self.batch_size
for step in xrange(self.num_batches):
if (summary_writer and self.params.save_summaries_steps > 0 and
(step + 1) % self.params.save_summaries_steps == 0):
results, summary_str = sess.run([fetches, summary_op])
summary_writer.add_summary(summary_str)
else:
results = sess.run(fetches)
# Make global_step available in results for postprocessing.
results['global_step'] = global_step
results = self.model.postprocess(results)
top_1_accuracy_sum += results['top_1_accuracy']
top_5_accuracy_sum += results['top_5_accuracy']
if (step + 1) % self.params.display_every == 0:
duration = time.perf_counter() - start_time
examples_per_sec = (
self.batch_size * self.params.display_every / duration)
log_fn('%i\t%.1f examples/sec' % (step + 1, examples_per_sec))
start_time = time.perf_counter()
if image_producer is not None:
image_producer.notify_image_consumption()
loop_end_time = time.perf_counter()
accuracy_at_1 = top_1_accuracy_sum / self.num_batches
accuracy_at_5 = top_5_accuracy_sum / self.num_batches
summary = tf.Summary()
summary.value.add(tag='eval/Accuracy@1', simple_value=accuracy_at_1)
summary.value.add(tag='eval/Accuracy@5', simple_value=accuracy_at_5)
for result_key, result_value in results.items():
if result_key.startswith(constants.SIMPLE_VALUE_RESULT_PREFIX):
prefix_len = len(constants.SIMPLE_VALUE_RESULT_PREFIX)
summary.value.add(tag='eval/' + result_key[prefix_len:],
simple_value=result_value)
if summary_writer:
summary_writer.add_summary(summary, global_step)
log_fn('Accuracy @ 1 = %.4f Accuracy @ 5 = %.4f [%d examples]' %
(accuracy_at_1, accuracy_at_5, total_eval_count))
elapsed_time = loop_end_time - loop_start_time
images_per_sec = (self.num_batches * self.batch_size / elapsed_time)
if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
# Note that we compute the top 1 accuracy and top 5 accuracy for each
# batch, which will have a slight performance impact.
log_fn('-' * 64)
log_fn('total images/sec: %.2f' % images_per_sec)
log_fn('-' * 64)
if self.benchmark_logger:
eval_result = {
'eval_top_1_accuracy': accuracy_at_1,
'eval_top_5_accuracy': accuracy_at_5,
'eval_average_examples_per_sec': images_per_sec,
tf.GraphKeys.GLOBAL_STEP: global_step,
}
self.benchmark_logger.log_evaluation_result(eval_result)
mlperf.logger.log_eval_epoch(
mlperf.tags.EVAL_STOP, global_step, self.batch_size)
mlperf.logger.log(key=mlperf.tags.EVAL_SIZE,
value=self.num_batches * self.batch_size)
if self.params.model != 'ssd300': # ssd300 logs eval accuracy elsewhere.
mlperf.logger.log_eval_accuracy(
accuracy_at_1, global_step, self.train_batch_size,
examples_per_epoch=self.dataset.num_examples_per_epoch('train'))
if self.params.stop_at_top_1_accuracy:
mlperf.logger.log(key=mlperf.tags.EVAL_TARGET,
value=self.params.stop_at_top_1_accuracy)
return accuracy_at_1, accuracy_at_5
def _benchmark_train(self):
"""Run cnn in benchmark mode. Skip the backward pass if forward_only is on.
Returns:
Dictionary containing training statistics (num_workers, num_steps,
average_wall_time, images_per_sec).
"""
graph = tf.Graph()
with graph.as_default():
build_result = self._build_graph()
if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
with self.variable_mgr.reuse_variables():
with tf.name_scope('Evaluation') as ns:
eval_build_results = self._build_eval_graph(ns)
else:
eval_build_results = None
(graph, result_to_benchmark) = self._preprocess_graph(graph, build_result)
with graph.as_default():
return self._benchmark_graph(result_to_benchmark, eval_build_results)
GPU_CACHED_INPUT_VARIABLE_NAME = 'gpu_cached_inputs'
def _unfreezable_local_variables(self, graph):
"""Get the local variables that we don't want to freeze."""
return graph.get_collection(
tf.GraphKeys.LOCAL_VARIABLES,
# We don't freeze the gpu_cached_inputs local variable so it won't get
# constant folded with ops which process the input.
scope='.*' + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME)
def _build_graph(self):
"""Build the graph.
Returns:
A namedtuple containing the ops/tensors that are required by
_benchmark_graph().
"""
if self.single_session:
(input_producer_op, enqueue_ops, fetches) = (
self._build_model_single_session())
else:
(input_producer_op, enqueue_ops, fetches) = self._build_model()
fetches_list = nest.flatten(list(fetches.values()))
main_fetch_group = tf.group(*fetches_list, name='main_fetch_group')
execution_barrier = None
if (not self.single_session and self.job_name and
not self.params.cross_replica_sync):
execution_barrier = self.add_sync_queues_and_barrier(
'execution_barrier_', [])
global_step = tf.train.get_global_step()
with tf.device(self.global_step_device), tf.name_scope('inc_global_step'):
with tf.control_dependencies([main_fetch_group]):
fetches['inc_global_step'] = global_step.assign_add(1)
if ((not self.single_session) and (not self.distributed_collective) and
self.job_name and self.params.cross_replica_sync):
# Block all replicas until all replicas are ready for next step.
fetches['sync_queues'] = self.add_sync_queues_and_barrier(
'sync_queues_step_end_', [main_fetch_group])
# Skips the init ops for freezable local variables in forward_only mode so
# we can remove all the assign ops when converting variables to constants.
with tf.name_scope('local_variable_initialization'):
if self.forward_only_and_freeze:
local_var_init_op = tf.variables_initializer(
self._unfreezable_local_variables(tf.get_default_graph()))
else:
local_var_init_op = tf.local_variables_initializer()
table_init_ops = tf.tables_initializer()
variable_manager_init_ops = [local_var_init_op]
if table_init_ops:
variable_manager_init_ops.extend([table_init_ops])
if not self.forward_only_and_freeze:
with tf.control_dependencies([local_var_init_op]):
variable_manager_init_ops.extend(self.variable_mgr.get_post_init_ops())
if ((not self.single_session) and (not self.distributed_collective) and
self.job_name and self.params.cross_replica_sync):
# Ensure all workers execute variable_manager_init_ops before they start
# executing the model.
variable_manager_init_ops.append(
self.add_sync_queues_and_barrier('init_ops_end_',
variable_manager_init_ops))
local_var_init_op_group = tf.group(*variable_manager_init_ops,
name='local_var_init_op_group')
summary_op = tf.summary.merge_all()
return GraphInfo(
input_producer_op=input_producer_op,
enqueue_ops=enqueue_ops,
fetches=fetches,
execution_barrier=execution_barrier,
global_step=global_step,
local_var_init_op_group=local_var_init_op_group,
summary_op=summary_op)
def _benchmark_graph(self, graph_info, eval_graph_info):
"""Benchmark the training graph.
Args:
graph_info: the namedtuple returned by _build_graph() which
contains all necessary information to benchmark the graph, including
named tensors/ops list, fetches, etc.
eval_graph_info: Similar to graph_info but for the eval graph if
--eval_during_training_* is used. Otherwise, None.
Returns:
Dictionary containing training statistics (num_workers, num_steps,
average_wall_time, images_per_sec).
"""
log_fn('Initializing graph')
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
# First worker will be 'chief' - it will write summaries and
# save checkpoints.
is_chief = hvd.rank() == 0
else:
is_chief = (not self.job_name or self.task_index == 0)
summary_writer = None
if (is_chief and self.params.summary_verbosity and self.params.train_dir and
self.params.save_summaries_steps > 0):
summary_writer = tf.summary.FileWriter(self.params.train_dir,
tf.get_default_graph())
    # We want to start the benchmark timer right after an image_producer barrier
    # and avoid undesired waiting times on barriers.
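    # For example (illustrative values): with batch_group_size=4, 2 enqueue ops
    # and 10 warmup batches, 10 + 2 - 1 = 11 is not a multiple of 4, so the
    # warmup count below is rounded up to 11 (11 + 2 - 1 = 12).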
if ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
self.batch_group_size) != 0:
self.num_warmup_batches = int(
math.ceil(
(self.num_warmup_batches + len(graph_info.enqueue_ops) - 1.0) /
(self.batch_group_size)) * self.batch_group_size -
len(graph_info.enqueue_ops) + 1)
log_fn('Round up warm up steps to %d to match batch_group_size' %
self.num_warmup_batches)
assert ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
self.batch_group_size) == 0
# We run the summaries in the same thread as the training operations by
# passing in None for summary_op to avoid a summary_thread being started.
# Running summaries and training operations in parallel could run out of
# GPU memory.
if is_chief and not self.forward_only_and_freeze:
saver = tf.train.Saver(
self.variable_mgr.savable_variables(),
save_relative_paths=True,
max_to_keep=self.params.max_ckpts_to_keep)
else:
saver = None
ready_for_local_init_op = None
if self.job_name and not (self.single_session or
self.distributed_collective):
# In distributed mode, we don't want to run local_var_init_op_group until
# the global variables are initialized, because local_var_init_op_group
# may use global variables (such as in distributed replicated mode). We
# don't set this in non-distributed mode, because in non-distributed mode,
# local_var_init_op_group may itself initialize global variables (such as
# in replicated mode).
ready_for_local_init_op = tf.report_uninitialized_variables(
tf.global_variables())
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
bcast_global_variables_op = hvd.broadcast_global_variables(0)
else:
bcast_global_variables_op = None
if self.params.variable_update == 'collective_all_reduce':
# It doesn't matter what this collective_graph_key value is,
# so long as it's > 0 and the same at every worker.
init_run_options = tf.RunOptions()
init_run_options.experimental.collective_graph_key = 6
else:
init_run_options = tf.RunOptions()
local_var_init_ops = [graph_info.local_var_init_op_group]
if eval_graph_info:
# `eval_graph_info.local_var_init_op_group` also includes some of the
# training initializer ops, since it's difficult to filter them out.
# Rerunning the training initializer ops is OK, but we add a control
# dependency since running two sets of training initializer ops at the
# same time can cause race conditions.
with tf.control_dependencies(local_var_init_ops):
local_var_init_ops.append(eval_graph_info.local_var_init_op_group)
sv = tf.train.Supervisor(
# For the purpose of Supervisor, all Horovod workers are 'chiefs',
# since we want session to be initialized symmetrically on all the
# workers.
is_chief=is_chief or (self.params.variable_update == 'horovod'
or self.distributed_collective),
# Log dir should be unset on non-chief workers to prevent Horovod
# workers from corrupting each other's checkpoints.
logdir=self.params.train_dir if is_chief else None,
ready_for_local_init_op=ready_for_local_init_op,
local_init_op=local_var_init_ops,
saver=saver,
global_step=graph_info.global_step,
summary_op=None,
save_model_secs=self.params.save_model_secs,
summary_writer=summary_writer,
local_init_run_options=init_run_options)
profiler = tf.profiler.Profiler() if self.params.tfprof_file else None
if self.graph_file is not None:
path, filename = os.path.split(self.graph_file)
as_text = filename.endswith('txt')
log_fn('Writing GraphDef as %s to %s' % ( # pyformat break
'text' if as_text else 'binary', self.graph_file))
tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
path, filename, as_text)
start_standard_services = (
self.params.train_dir or
self.dataset.queue_runner_required())
target = self.cluster_manager.get_target() if self.cluster_manager else ''
with sv.managed_session(
master=target,
config=create_config_proto(self.params),
start_standard_services=start_standard_services) as sess:
# Anything that can potentially raise an OutOfRangeError with 'sess' MUST
# be under this try block. The managed_session() context manager silently
# ignores OutOfRangeError, so we must catch them and wrap them with
# a different exception type so that they can be propagated up to the
# caller.
try:
stats = self.benchmark_with_session(
sess, sv, graph_info, eval_graph_info, bcast_global_variables_op,
is_chief, summary_writer, profiler)
except tf.errors.OutOfRangeError:
raise RuntimeError(
'Received OutOfRangeError. Wrapping in Runtime error to avoid '
'Supervisor from suppressing the error. Original OutOfRangeError '
'with traceback:\n' + traceback.format_exc())
sv.stop()
if profiler:
generate_tfprof_profile(profiler, self.params.tfprof_file)
return stats
def benchmark_with_session(self, sess, supervisor, graph_info,
eval_graph_info, bcast_global_variables_op,
is_chief, summary_writer, profiler):
"""Benchmarks the graph with the given session.
Args:
sess: The session to benchmark the graph with
supervisor: The Supervisor that created the session.
graph_info: the namedtuple returned by _build_graph() which
contains all necessary information to benchmark the graph, including
named tensors/ops list, fetches, etc.
eval_graph_info: Similar to graph_info but for the eval graph if
--eval_during_training_every_n_steps is used. Otherwise, None.
bcast_global_variables_op: If Horovod is used, the op to broadcast the
global variables to all the processes. None if Horovod is not used.
is_chief: True if this is the chief process.
summary_writer: The SummaryWriter used to write summaries, or None if
summaries are not used.
profiler: The tf.profiler.Profiler, or None if tfprof is not used.
Returns:
Dictionary containing training statistics (num_workers, num_steps,
average_wall_time, images_per_sec).
"""
if self.params.backbone_model_path is not None:
self.model.load_backbone_model(sess, self.params.backbone_model_path)
if bcast_global_variables_op:
sess.run(bcast_global_variables_op)
image_producer = None
if graph_info.input_producer_op is not None:
image_producer = cnn_util.ImageProducer(
sess, graph_info.input_producer_op, self.batch_group_size,
self.params.use_python32_barrier)
image_producer.start()
if graph_info.enqueue_ops:
for i in xrange(len(graph_info.enqueue_ops)):
sess.run(graph_info.enqueue_ops[:(i + 1)])
if image_producer is not None:
image_producer.notify_image_consumption()
self.init_global_step, = sess.run([graph_info.global_step])
if self.job_name and not self.params.cross_replica_sync:
# TODO(zhengxq): Do we need to use a global step watcher at all?
global_step_watcher = GlobalStepWatcher(
sess, graph_info.global_step,
self.num_workers * self.num_warmup_batches +
self.init_global_step,
self.num_workers * (self.num_warmup_batches + self.num_batches) - 1)
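      # For example (illustrative): with 2 workers, 10 warmup batches, 100
      # benchmark batches and an initial global step of 0, the start and end
      # step arguments above evaluate to 20 and 219 respectively.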
global_step_watcher.start()
else:
global_step_watcher = None
eval_image_producer = None
if eval_graph_info:
# We pass local_var_init_op_group=None because the Supervisor already
# initialized local variables above. We need to have the Supervisor
# initialize the local variables, because otherwise it throws an error
# complaining that not all variables were initialized.
eval_image_producer = self._initialize_eval_graph(
eval_graph_info.enqueue_ops, eval_graph_info.input_producer_op,
local_var_init_op_group=None, sess=sess)
step_train_times = []
log_fn('Running warm up')
local_step = -1 * self.num_warmup_batches
if self.single_session:
# In single session mode, each step, the global_step is incremented by
# 1. In non-single session mode, each step, the global_step is
# incremented once per worker. This means we need to divide
# init_global_step by num_workers only in non-single session mode.
end_local_step = self.num_batches - self.init_global_step
else:
end_local_step = self.num_batches - (self.init_global_step //
self.num_workers)
if not global_step_watcher:
# In cross-replica sync mode, all workers must run the same number of
# local steps, or else the workers running the extra step will block.
done_fn = lambda: local_step >= end_local_step
else:
done_fn = global_step_watcher.done
if self.params.debugger is not None:
if self.params.debugger == 'cli':
log_fn('The CLI TensorFlow debugger will be used.')
sess = tf_debug.LocalCLIDebugWrapperSession(sess)
else:
log_fn('The TensorBoard debugger plugin will be used.')
sess = tf_debug.TensorBoardDebugWrapperSession(sess,
self.params.debugger)
mlperf.logger.log(key=mlperf.tags.TRAIN_LOOP)
skip_final_eval = False
accuracy_at_1 = None
accuracy_at_5 = None
last_eval_step = local_step
loop_start_time = time.perf_counter()
last_average_loss = None
while not done_fn():
if local_step == 0:
log_fn('Done warm up')
if graph_info.execution_barrier:
log_fn('Waiting for other replicas to finish warm up')
sess.run([graph_info.execution_barrier])
# TODO(laigd): rename 'Img' to maybe 'Input'.
header_str = ('Step\tImg/sec\t' +
self.params.loss_type_to_report.replace('/', ' '))
if self.params.print_training_accuracy or self.params.forward_only:
# TODO(laigd): use the actual accuracy op names of the model.
header_str += '\ttop_1_accuracy\ttop_5_accuracy'
log_fn(header_str)
assert len(step_train_times) == self.num_warmup_batches
# reset times to ignore warm up batch
step_train_times = []
loop_start_time = time.perf_counter()
if (summary_writer and
(local_step + 1) % self.params.save_summaries_steps == 0):
fetch_summary = graph_info.summary_op
else:
fetch_summary = None
collective_graph_key = 7 if (
self.params.variable_update == 'collective_all_reduce') else 0
(summary_str, last_average_loss) = benchmark_one_step(
sess, graph_info.fetches, local_step,
self.batch_size * (self.num_workers
if self.single_session else 1), step_train_times,
self.trace_filename, self.params.partitioned_graph_file_prefix,
profiler, image_producer, self.params, fetch_summary,
benchmark_logger=self.benchmark_logger,
collective_graph_key=collective_graph_key,
should_output_files=(self.params.variable_update != 'horovod' or
is_chief))
if summary_str is not None and is_chief:
supervisor.summary_computed(sess, summary_str)
local_step += 1
if (self.params.save_model_steps and
local_step % self.params.save_model_steps == 0 and
local_step > 0 and
is_chief):
supervisor.saver.save(sess, supervisor.save_path,
supervisor.global_step)
if (eval_graph_info and local_step > 0 and not done_fn() and
self._should_eval_during_training(local_step)):
python_global_step = sess.run(graph_info.global_step)
num_steps_since_last_eval = local_step - last_eval_step
# The INPUT_SIZE tag value might not match the
# PREPROC_NUM_TRAIN_EXAMPLES tag value, because the number of examples
# run, which is INPUT_SIZE, is rounded up to the nearest multiple of
# self.batch_size.
mlperf.logger.log(
key=mlperf.tags.INPUT_SIZE,
value=num_steps_since_last_eval * self.batch_size)
log_fn('Running evaluation at global_step {}'.format(
python_global_step))
accuracy_at_1, accuracy_at_5 = self._eval_once(
sess, summary_writer, eval_graph_info.fetches,
eval_graph_info.summary_op, eval_image_producer,
python_global_step)
last_eval_step = local_step
if (self.params.stop_at_top_1_accuracy and
accuracy_at_1 >= self.params.stop_at_top_1_accuracy):
log_fn('Stopping, as eval accuracy at least %s was reached' %
self.params.stop_at_top_1_accuracy)
skip_final_eval = True
break
else:
log_fn('Resuming training')
if eval_graph_info and self.model.reached_target():
log_fn('Stopping, as the model indicates its custom goal was reached')
skip_final_eval = True
break
loop_end_time = time.perf_counter()
    # Wait until all global steps are done, regardless of done_fn.
if global_step_watcher:
while not global_step_watcher.done():
time.sleep(.25)
if not global_step_watcher:
elapsed_time = loop_end_time - loop_start_time
average_wall_time = elapsed_time / local_step if local_step > 0 else 0
images_per_sec = (self.num_workers * local_step * self.batch_size /
elapsed_time)
num_steps = local_step * self.num_workers
else:
# NOTE: Each worker independently increases the global step. So,
# num_steps will be the sum of the local_steps from each worker.
num_steps = global_step_watcher.num_steps()
elapsed_time = global_step_watcher.elapsed_time()
average_wall_time = (elapsed_time * self.num_workers / num_steps
if num_steps > 0 else 0)
images_per_sec = num_steps * self.batch_size / elapsed_time
# We skip printing images/sec if --eval_during_training_* is specified,
# because we are both processing training and evaluation images, so a
# singular "images/sec" value is meaningless.
if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
log_fn('-' * 64)
# TODO(laigd): rename 'images' to maybe 'inputs'.
log_fn('total images/sec: %.2f' % images_per_sec)
log_fn('-' * 64)
else:
log_fn('Done with training')
num_steps_since_last_eval = local_step - last_eval_step
mlperf.logger.log(
key=mlperf.tags.INPUT_SIZE,
value=num_steps_since_last_eval * self.batch_size)
python_global_step = sess.run(graph_info.global_step)
if eval_graph_info and not skip_final_eval:
log_fn('Running final evaluation at global_step {}'.format(
python_global_step))
accuracy_at_1, accuracy_at_5 = self._eval_once(
sess, summary_writer, eval_graph_info.fetches,
eval_graph_info.summary_op, eval_image_producer, python_global_step)
num_epochs_ran = (python_global_step * self.batch_size /
self.dataset.num_examples_per_epoch('train'))
mlperf.logger.log_train_epochs(num_epochs_ran)
if image_producer is not None:
image_producer.done()
if eval_image_producer is not None:
eval_image_producer.done()
if is_chief:
if self.benchmark_logger:
self.benchmark_logger.log_metric(
'average_examples_per_sec', images_per_sec, global_step=num_steps)
# Save the model checkpoint.
if self.params.train_dir is not None and is_chief:
checkpoint_path = os.path.join(self.params.train_dir, 'model.ckpt')
if not gfile.Exists(self.params.train_dir):
gfile.MakeDirs(self.params.train_dir)
supervisor.saver.save(sess, checkpoint_path, graph_info.global_step)
if graph_info.execution_barrier:
# Wait for other workers to reach the end, so this worker doesn't
# go away underneath them.
sess.run([graph_info.execution_barrier])
stats = {
'num_workers': self.num_workers,
'num_steps': num_steps,
'average_wall_time': average_wall_time,
'images_per_sec': images_per_sec
}
if last_average_loss is not None:
stats['last_average_loss'] = last_average_loss
if accuracy_at_1 is not None:
stats['top_1_accuracy'] = accuracy_at_1
if accuracy_at_5 is not None:
stats['top_5_accuracy'] = accuracy_at_5
success = bool(self.model.reached_target() or
(accuracy_at_1 and self.params.stop_at_top_1_accuracy and
accuracy_at_1 >= self.params.stop_at_top_1_accuracy))
mlperf.logger.log(key=mlperf.tags.RUN_STOP, value={'success': success})
mlperf.logger.log(key=mlperf.tags.RUN_FINAL)
return stats
def _should_eval_during_training(self, step):
"""Return True iff should run eval during training at current step."""
assert self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL
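    # For example (illustrative): with --eval_during_training_every_n_steps=100,
    # this returns True whenever `step` is a multiple of 100.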
if self.params.eval_during_training_every_n_steps:
return step % self.params.eval_during_training_every_n_steps == 0
# All other --eval_during_training_* flags are converted to step numbers
# at which the model should run evaluation during training.
return step in self.eval_during_training_at_specified_steps
def _preprocess_graph(self, graph, graph_info):
"""Preprocess the graph before executing.
Depending on the params, it runs various preprocessing on the graph,
including freezing, TensorRT conversion, etc.
Args:
graph: the graph to preprocess.
graph_info: the namedtuple returned by _build_graph() which
contains all necessary information to benchmark the graph, including
named tensors/ops list, fetches, etc.
Returns:
The updated graph and graph_info with the ops/tensors/fetches updated
according to the imported graph.
"""
assert isinstance(graph_info.fetches, dict)
assert isinstance(graph_info.global_step, tf.Variable)
if not self.forward_only_and_freeze:
return (graph, graph_info)
    # Get the names of the ops that we need to keep during conversion.
flattened_op_names = list(
set([
v.name.split(':')[0]
for v in nest.flatten(graph_info)
if v is not None
]))
# Get variables that we don't want to freeze.
# Only keep unfreezable variables in forward_only_and_freeze mode.
# TODO(laigd): consider making global_step a constant.
variables_to_keep = {graph_info.global_step: tf.GraphKeys.GLOBAL_VARIABLES}
variables_to_keep.update({
local_variable: tf.GraphKeys.LOCAL_VARIABLES
for local_variable in self._unfreezable_local_variables(graph)
})
variable_initializers = [
variable.initializer.name for variable in variables_to_keep]
output_node_names = (
flattened_op_names +
# Add variable initializer and read ops to the output list, so
# convert_variables_to_constants() will keep them.
variable_initializers +
[variable.value().op.name for variable in variables_to_keep])
graphdef = graph.as_graph_def(add_shapes=True)
# Freeze the graph.
with graph.as_default():
with tf.Session(config=create_config_proto(self.params)) as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
graphdef = graph_util.convert_variables_to_constants(
sess,
graphdef,
output_node_names,
variable_names_blacklist=[
variable.op.name for variable in variables_to_keep
])
# Run TensorRT conversion.
if self.params.trt_mode:
# Import here instead of at top, because this will crash if TensorRT is
# not installed
from tensorflow.python.compiler.tensorrt import trt_convert # pylint: disable=g-import-not-at-top
      # Prevent the TF-TRT bridge from touching the variable initializer ops and
      # their dependencies, since they can be fetched directly by the sess.run()
      # calls that initialize the variables.
# pylint: disable=protected-access
name_to_input_name, _, _ = graph_util_impl._extract_graph_summary(
graphdef)
initializer_subgraph_ops = graph_util_impl._bfs_for_reachable_nodes(
variable_initializers, name_to_input_name)
# pylint: enable=protected-access
graphdef = trt_convert.create_inference_graph(
graphdef,
outputs=output_node_names + list(initializer_subgraph_ops),
max_batch_size=self.model.get_batch_size(),
max_workspace_size_bytes=self.params.trt_max_workspace_size_bytes,
precision_mode=self.params.trt_mode)
    # Create a new graph as the default and import the converted graph back.
updated_graph = tf.Graph()
def _get_tensors_or_ops(inputs):
"""Gets the updated tensors or ops from 'updated_graph'."""
def _get_fn(element):
if element is None:
return None
if ':' in element.name:
return updated_graph.get_tensor_by_name(element.name)
return updated_graph.get_operation_by_name(element.name)
if isinstance(inputs, (list, dict, tuple)):
return nest.map_structure(_get_fn, inputs)
else:
return _get_fn(inputs)
with updated_graph.as_default():
importer.import_graph_def(graph_def=graphdef, name='')
# Update the variables
for variable in variables_to_keep:
updated_variable = tf.Variable.from_proto(variable.to_proto())
tf.add_to_collection(variables_to_keep[variable], updated_variable)
if variable is graph_info.global_step:
updated_global_step = updated_variable
updated_graph_info = GraphInfo(
input_producer_op=_get_tensors_or_ops(graph_info.input_producer_op),
enqueue_ops=_get_tensors_or_ops(graph_info.enqueue_ops),
execution_barrier=_get_tensors_or_ops(graph_info.execution_barrier),
local_var_init_op_group=_get_tensors_or_ops(
graph_info.local_var_init_op_group),
fetches=_get_tensors_or_ops(graph_info.fetches),
global_step=updated_global_step,
summary_op=None)
return (updated_graph, updated_graph_info)
def _build_input_processing(self, shift_ratio=0):
""""Build the image (pre)processing portion of the model graph.
Args:
shift_ratio: shift_ratio for data_flow_ops.RecordInput.
Returns:
An InputProcessingInfo containing all the input sources to the model.
"""
input_processing_info = InputProcessingInfo(
input_producer_op=None,
input_producer_stages=None,
multi_device_iterator_input=None)
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
if not self._doing_eval:
mlperf.logger.log(key=mlperf.tags.INPUT_BATCH_SIZE, value=self.batch_size)
# If using synthetic gpu inputs, do nothing on the cpu side.
if self.dataset.use_synthetic_gpu_inputs():
assert not self.datasets_use_prefetch
return input_processing_info
if self._doing_eval:
input_preprocessor = self.eval_input_preprocessor
mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES,
value=self.dataset.num_examples_per_epoch('validation'))
else:
input_preprocessor = self.input_preprocessor
mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES,
value=self.dataset.num_examples_per_epoch('train'))
# Use prefetching mechanism provided by dataset input pipeline.
if self.datasets_use_prefetch:
multi_device_iterator = (
input_preprocessor.build_multi_device_iterator(
self.batch_size, len(self.devices), self.cpu_device, self.params,
self.raw_devices, self.dataset, self._doing_eval))
return input_processing_info._replace(
multi_device_iterator_input=multi_device_iterator.get_next())
# Not using dataset prefetching. Use a staging area to mimic the prefetching
# behavior instead.
with tf.device(self.cpu_device):
if self._doing_eval:
subset = 'validation'
else:
subset = 'train'
input_list = input_preprocessor.minibatch(
self.dataset,
subset=subset,
params=self.params,
shift_ratio=shift_ratio)
input_producer_op = []
input_producer_stages = []
for device_num in range(len(self.devices)):
staging_area = data_flow_ops.StagingArea(
[parts[0].dtype for parts in input_list],
shapes=[parts[0].get_shape() for parts in input_list],
shared_name='input_producer_staging_area_%d_eval_%s' %
(device_num, self._doing_eval))
input_producer_stages.append(staging_area)
for group_index in xrange(self.batch_group_size):
batch_index = group_index + device_num * self.batch_group_size
put_op = staging_area.put(
[parts[batch_index] for parts in input_list])
input_producer_op.append(put_op)
assert input_producer_op
return input_processing_info._replace(
input_producer_op=input_producer_op,
input_producer_stages=input_producer_stages)
def _maybe_initialize_fp16(self):
"""Initialize fp16 settings."""
if self.params.use_fp16 and not self._doing_eval:
init_loss_scale_val = float(self.params.fp16_loss_scale or
self.model.get_fp16_loss_scale())
self.loss_scale = None
self.loss_scale_normal_steps = None
if self.enable_auto_loss_scale or init_loss_scale_val != 1:
self.loss_scale = tf.get_variable(
name='loss_scale',
initializer=init_loss_scale_val,
dtype=tf.float32,
trainable=False)
if self.enable_auto_loss_scale:
self.loss_scale_normal_steps = tf.get_variable(
name='loss_scale_normal_steps', initializer=0, trainable=False)
def _build_model(self):
"""Build the TensorFlow graph."""
if self.datasets_use_prefetch:
assert not self.params.staged_vars
assert not self.variable_mgr.supports_staged_vars()
    # Adjust seed so different workers start reading different input files.
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
seed_adjustment = hvd.rank()
else:
seed_adjustment = 0
mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
value=self.params.tf_random_seed + seed_adjustment)
tf.set_random_seed(self.params.tf_random_seed + seed_adjustment)
mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
value=4321 + seed_adjustment)
np.random.seed(4321 + seed_adjustment)
phase_train = not (self._doing_eval or self.params.forward_only)
if self._doing_eval:
mode_string = 'evaluation'
else:
mode_string = 'training'
log_fn('Generating {} model'.format(mode_string))
losses = []
device_grads = []
all_logits = []
all_accuracy_ops = {}
gpu_compute_stage_ops = []
gpu_grad_stage_ops = []
with tf.device(self.global_step_device):
global_step = tf.train.get_or_create_global_step()
self._maybe_initialize_fp16()
# Build the processing and model for the worker.
input_producer_op = None
with tf.name_scope('input_processing'):
input_processing_info = self._build_input_processing(shift_ratio=0)
if input_processing_info.input_producer_op is not None:
input_producer_op = tf.group(*input_processing_info.input_producer_op)
update_ops = None
staging_delta_ops = []
for device_num in range(len(self.devices)):
with tf.name_scope('tower_%i' % device_num) as name_scope, (
self.variable_mgr.create_outer_variable_scope(device_num)):
results = self.add_forward_pass_and_gradients(
phase_train, device_num, device_num, input_processing_info,
gpu_compute_stage_ops, gpu_grad_stage_ops)
if self.params.backbone_model_path:
self.model.add_backbone_saver()
if phase_train:
losses.append(results['loss'])
device_grads.append(results['gradvars'])
else:
all_logits.append(results['logits'])
if not phase_train or self.params.print_training_accuracy:
for name, op in results.items():
if name.startswith('accuracy:'):
key = name[9:]
if key not in all_accuracy_ops:
all_accuracy_ops[key] = []
all_accuracy_ops[key].append(op)
if device_num == 0:
# Retain the Batch Normalization updates operations only from the
# first tower. These operations update the moving mean and moving
# variance variables, which are updated (but not used) during
# training, and used during evaluation. The moving mean and variance
# approximate the true mean and variance across all images in the
# dataset. Therefore, in replicated mode, these moving averages would
# be almost identical for each tower, and so we only update and save
# the moving averages for one tower. In parameter server mode, all
# towers share a copy of the variables so we also only need to update
# and save the moving averages once.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
if self.datasets_use_prefetch:
assert not self.variable_mgr.staging_delta_ops
else:
staging_delta_ops = list(self.variable_mgr.staging_delta_ops)
enqueue_ops = []
if not self.datasets_use_prefetch:
if self.variable_mgr.supports_staged_vars():
for staging_ops in self.variable_mgr.staging_vars_on_devices:
gpu_compute_stage_ops.extend(
[put_op for _, (put_op, _) in six.iteritems(staging_ops)])
enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
name='gpu_compute_stage_ops_group'))
if gpu_grad_stage_ops:
staging_delta_ops += gpu_grad_stage_ops
if staging_delta_ops:
enqueue_ops.append(tf.group(*(staging_delta_ops)))
if (self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL and
self.params.variable_update == 'replicated'):
# We need to get all the update ops instead of only those for the first
# tower. This is because during evaluation, each tower will read from its
# own tower's moving averages instead of the first tower's moving
# averages.
# TODO(reedwm): Have each tower read from the first tower's moving
# averages for a slight performance gain.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
mlperf.logger.log(key=mlperf.tags.INPUT_BN_SPAN,
value=self.batch_size // len(self.raw_devices))
fetches = self._build_fetches(global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops,
phase_train)
return (input_producer_op, enqueue_ops, fetches)
def _build_fetches(self, global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops, phase_train):
"""Complete construction of model graph, populating the fetches map."""
fetches = {}
if enqueue_ops:
fetches['enqueue_ops'] = enqueue_ops
for name, ops in all_accuracy_ops.items():
      # For fetches whose names start with 'tensor:', keep their dimensions and
      # skip reducing them to scalars.
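      # For example (illustrative): an op named 'tensor:top_5_accuracy' is
      # concatenated across towers and fetched under the key 'top_5_accuracy',
      # while a plain 'top_1_accuracy' op is summed and divided by the effective
      # batch size below.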
if name.startswith(constants.UNREDUCED_ACCURACY_OP_PREFIX):
key = name[len(constants.UNREDUCED_ACCURACY_OP_PREFIX):]
fetches[key] = tf.concat(ops, 0)
else:
fetches[name] = (
tf.reduce_sum(ops) /
(self.batch_size *
(self.num_workers if self.single_session else 1)))
if self.task_index == 0 and self.params.summary_verbosity >= 1:
tf.summary.scalar(name, fetches[name])
if not phase_train:
if self.params.forward_only:
fetches['all_logits'] = tf.concat(all_logits, 0)
return fetches
apply_gradient_devices, gradient_state = (
self.variable_mgr.preprocess_device_grads(device_grads))
# TODO(reedwm): Greatly simplify the learning rate code.
if (self.params.variable_update == 'horovod' or
self.params.variable_update == 'collective_all_reduce'):
# Each worker independently increments global_step.
examples_per_step = self.batch_size * self.num_workers
else:
# global_step is shared by all workers, and so every iteration
# global_step is incremented by num_workers.
examples_per_step = self.batch_size
if self.params.compute_lr_on_cpu:
with tf.device(self.cpu_device):
learning_rate = get_learning_rate(self.params, global_step,
self.dataset.num_examples_per_epoch(),
self.model, examples_per_step)
training_ops = []
for d, device in enumerate(apply_gradient_devices):
with tf.device(device):
with tf.name_scope('average_loss'):
average_loss = tf.reduce_mean(losses)
with tf.name_scope('get_gradients_to_apply'):
avg_grads = self.variable_mgr.get_gradients_to_apply(d,
gradient_state)
if not self.params.compute_lr_on_cpu:
# We compute the learning rate once for each device in
# `apply_gradient_devices`.
learning_rate = get_learning_rate(
self.params, global_step, self.dataset.num_examples_per_epoch(),
self.model, examples_per_step)
gradient_clip = self.params.gradient_clip
if gradient_clip is not None:
with tf.name_scope('clip_gradients'):
clipped_grads = [(tf.clip_by_value(grad, -gradient_clip,
+gradient_clip), var)
for grad, var in avg_grads]
else:
clipped_grads = avg_grads
learning_rate = tf.identity(learning_rate, name='learning_rate_tensor')
opt = get_optimizer(self.params, learning_rate)
loss_scale_params = variable_mgr_util.AutoLossScaleParams(
enable_auto_loss_scale=self.enable_auto_loss_scale,
loss_scale=self.loss_scale,
loss_scale_normal_steps=self.loss_scale_normal_steps,
inc_loss_scale_every_n=self.params.fp16_inc_loss_scale_every_n,
is_chief=not self.job_name or self.task_index == 0)
with tf.name_scope('append_apply_gradient_ops'):
self.variable_mgr.append_apply_gradients_ops(
gradient_state, opt, clipped_grads, training_ops,
loss_scale_params)
train_op = tf.group(*(training_ops + update_ops), name='train_ops_group')
with tf.device(self.cpu_device):
if self.task_index == 0 and self.params.summary_verbosity >= 1:
tf.summary.scalar('learning_rate', learning_rate)
tf.summary.scalar(self.params.loss_type_to_report, average_loss)
if self.loss_scale is not None:
tf.summary.scalar('loss_scale', self.loss_scale)
if self.loss_scale_normal_steps:
tf.summary.scalar('loss_scale_normal_steps',
self.loss_scale_normal_steps)
if self.params.summary_verbosity >= 2:
self.gradient_histogram_summary(avg_grads)
if self.params.summary_verbosity >= 3:
for grad, var in avg_grads:
if grad is not None:
tf.summary.histogram(var.op.name + '/gradients', grad)
for var in tf.trainable_variables():
tf.summary.histogram(var.op.name, var)
fetches['train_op'] = train_op
fetches['average_loss'] = average_loss
return fetches
def gradient_histogram_summary(self, avg_grads):
"""Create histogram of log values of all non-zero gradients."""
with tf.name_scope('log_gradients_summary'):
all_grads = []
for grad, _ in avg_grads:
all_grads.append(tf.reshape(grad, [-1]))
grads = tf.abs(tf.concat(all_grads, 0))
# exclude grads with zero values.
indices_for_non_zero_grads = tf.where(tf.not_equal(grads, 0))
log_grads = tf.reshape(
tf.log(tf.gather(grads, indices_for_non_zero_grads)), [-1])
tf.summary.histogram('log_gradients', log_grads)
def _build_model_single_session(self):
"""Build the TensorFlow graph for multiple replicas in a single_session.
    Single session runs multiple model replicas as part of one large
    distributed graph, whose global execution is always step-synchronized.
    Returns:
      input_producer_op:
      enqueue_ops:
      fetches:
    Raises:
      ValueError: optimizer not recognized.
    """
# verify assumptions
assert self.params.task_index == 0
assert not self._doing_eval
assert not self.params.forward_only
assert not self.params.staged_vars
tf.set_random_seed(self.params.tf_random_seed)
np.random.seed(4321)
phase_train = True
log_fn('Generating training model')
losses = []
device_grads = []
all_logits = []
all_accuracy_ops = {}
gpu_compute_stage_ops = []
gpu_grad_stage_ops = []
with tf.device(self.global_step_device):
global_step = tf.train.get_or_create_global_step()
update_ops = []
global_input_producer_op = []
is_local = not self.job_name
if is_local:
assert self.num_workers == 1
for task_num in range(self.num_workers):
# Reset the devices that self.variable_mgr knows about to those
# belonging to the next worker (task).
self.reset_devices_for_task(task_num, is_local)
# Build the per-worker image processing
with tf.name_scope('input_processing'):
input_processing_info = self._build_input_processing(
shift_ratio=(task_num / self.num_workers))
if input_processing_info.input_producer_op is not None:
global_input_producer_op.extend(input_processing_info.input_producer_op)
# Build the per-worker model replica.
for rel_device_num in range(len(self.devices)):
abs_device_num = task_num * len(self.devices) + rel_device_num
with self.variable_mgr.create_outer_variable_scope(
abs_device_num), tf.name_scope(
'task_%i_tower_%i' % (task_num, rel_device_num)) as name_scope:
task_results = self.add_forward_pass_and_gradients(
phase_train, rel_device_num, abs_device_num,
input_processing_info, gpu_compute_stage_ops, gpu_grad_stage_ops)
if self.params.backbone_model_path:
self.model.add_backbone_saver()
if phase_train:
losses.append(task_results['loss'])
device_grads.append(task_results['gradvars'])
else:
all_logits.append(task_results['logits'])
if not phase_train or self.params.print_training_accuracy:
for name, op in task_results.items():
if name.startswith('accuracy:'):
key = name[9:]
if key not in all_accuracy_ops:
all_accuracy_ops[key] = []
all_accuracy_ops[key].append(op)
if rel_device_num == 0:
# Retain the Batch Normalization updates operations only
# from the first tower. These operations update the moving
# mean and moving variance variables, which are updated
# (but not used) during training, and used during
# evaluation. The moving mean and variance approximate the
# true mean and variance across all images in the
# dataset. Therefore, in replicated mode, these moving
# averages would be almost identical for each tower, and
# so we only update and save the moving averages for one
# tower. In parameter server mode, all towers share a copy
# of the variables so we also only need to update and save
# the moving averages once.
update_ops.extend(
tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope))
assert not self.variable_mgr.staging_delta_ops
enqueue_ops = []
if gpu_compute_stage_ops:
enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
name='gpu_compute_stage_ops'))
assert not self.variable_mgr.supports_staged_vars()
assert not gpu_grad_stage_ops
fetches = self._build_fetches(global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops,
phase_train)
if global_input_producer_op:
global_input_producer_op = tf.group(*global_input_producer_op)
else:
global_input_producer_op = None
return (global_input_producer_op, enqueue_ops, fetches)
def add_forward_pass_and_gradients(self,
phase_train,
rel_device_num,
abs_device_num,
input_processing_info,
gpu_compute_stage_ops,
gpu_grad_stage_ops):
"""Add ops for forward-pass and gradient computations."""
nclass = self.dataset.num_classes
if self.datasets_use_prefetch:
assert input_processing_info.multi_device_iterator_input, (
'multi_device_iterator_input cannot be None if '
'datasets_use_prefetch=True')
input_list = (
input_processing_info.multi_device_iterator_input[rel_device_num])
else:
if not self.dataset.use_synthetic_gpu_inputs():
input_producer_stage = input_processing_info.input_producer_stages[
rel_device_num]
with tf.device(self.cpu_device):
host_input_list = input_producer_stage.get()
with tf.device(self.raw_devices[rel_device_num]):
gpu_compute_stage = data_flow_ops.StagingArea(
[inp.dtype for inp in host_input_list],
shapes=[inp.get_shape() for inp in host_input_list])
# The CPU-to-GPU copy is triggered here.
gpu_compute_stage_op = gpu_compute_stage.put(host_input_list)
input_list = gpu_compute_stage.get()
gpu_compute_stage_ops.append(gpu_compute_stage_op)
else:
with tf.device(self.raw_devices[rel_device_num]):
# Minor hack to avoid H2D copy when using synthetic data
input_list = self.model.get_synthetic_inputs(
BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME, nclass)
# Labels reshaping happens all on gpu:0. Reshaping synthetic labels on
# multiple devices slows down XLA computation for an unknown reason.
# TODO(b/116875203): Find/address root cause of XLA slow down.
labels_device_placement_hack = (
self.dataset.use_synthetic_gpu_inputs() and self.params.xla_compile)
def device_aware_reshape(tensor, shape):
device = self.devices[rel_device_num]
# Labels are int32, place reshapes on gpu:0 (no device placement) when the
# hack is enabled.
if labels_device_placement_hack and tensor.dtype == tf.int32:
device = ''
with tf.device(device):
return tf.reshape(tensor, shape=shape)
subset = 'validation' if self._doing_eval else 'train'
input_shapes = self.model.get_input_shapes(subset)
input_list = [
device_aware_reshape(input_list[i], shape=input_shapes[i])
for i in range(len(input_list))
]
def forward_pass_and_gradients():
"""Builds forward pass and gradient computation network.
When phase_train=True and print_training_accuracy=False:
return [loss] + grads
When phase_train=True and print_training_accuracy=True:
return [logits, loss] + grads
When phase_train=False,
return [logits]
Its output can always be unpacked by
```
outputs = forward_pass_and_gradients()
logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
```
Returns:
outputs: A list of tensors depending on different modes.
"""
build_network_result = self.model.build_network(
input_list, phase_train, nclass)
logits = build_network_result.logits
if not phase_train:
return [logits]
base_loss = self.model.loss_function(input_list, build_network_result)
params = self.variable_mgr.trainable_variables_on_device(
rel_device_num, abs_device_num)
l2_loss = None
total_loss = base_loss
with tf.name_scope('l2_loss'):
fp32_params = params
if self.model.data_type == tf.float16 and self.params.fp16_vars:
# fp16 reductions are very slow on GPUs, so cast to fp32 before
# calling tf.nn.l2_loss and tf.add_n.
# TODO(b/36217816): Once the bug is fixed, investigate if we should do
# this reduction in fp16.
fp32_params = (tf.cast(p, tf.float32) for p in params)
filtered_params = self.model.filter_l2_loss_vars(fp32_params)
if rel_device_num == len(self.devices) - 1:
# We compute the L2 loss for only one device instead of all of them,
# because the L2 loss for each device is the same. To adjust for this,
# we multiply the L2 loss by the number of devices. We choose the
# last device because for some reason, on a Volta DGX1, the first four
# GPUs take slightly longer to complete a step than the last four.
# TODO(reedwm): Shard the L2 loss computations across GPUs.
if self.params.single_l2_loss_op:
# TODO(reedwm): If faster, create a fused op that does the L2 loss
# on multiple tensors, and use that instead of concatenating
# tensors.
reshaped_params = [tf.reshape(p, (-1,)) for p in filtered_params]
l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0))
else:
l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in filtered_params])
weight_decay = self.params.weight_decay
mlperf.logger.log(key=mlperf.tags.OPT_WEIGHT_DECAY, value=weight_decay)
if (weight_decay is not None and weight_decay != 0. and
l2_loss is not None):
mlperf.logger.log(key=mlperf.tags.MODEL_L2_REGULARIZATION,
value=weight_decay)
total_loss += len(self.devices) * weight_decay * l2_loss
aggmeth = tf.AggregationMethod.DEFAULT
scaled_loss = (total_loss if self.loss_scale is None
else total_loss * self.loss_scale)
grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth)
if self.params.sparse_to_dense_grads:
# Passing a sparse gradient to convert_to_tensor turns it into a dense
# gradient. A sparse gradient is an instance of tf.IndexedSlices.
# convert_to_tensor does not modify dense tensors.
grads = [tf.convert_to_tensor(g) for g in grads]
if self.loss_scale is not None:
# TODO(reedwm): If automatic loss scaling is not used, we could avoid
# these multiplications by directly modifying the learning rate instead.
# If this is done, care must be taken to ensure that this scaling method
# is correct, as some optimizers square gradients and do other
# operations which might not be compatible with modifying both the
# gradients and the learning rate.
grads = [
grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads
]
if self.params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
if self.params.horovod_device:
horovod_device = '/%s:0' % self.params.horovod_device
else:
horovod_device = ''
# All-reduce gradients using Horovod.
grads = [hvd.allreduce(grad, average=False, device_dense=horovod_device)
for grad in grads]
if self.params.staged_vars:
grad_dtypes = [grad.dtype for grad in grads]
grad_shapes = [grad.shape for grad in grads]
grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes)
grad_stage_op = grad_stage.put(grads)
# In general, this decouples the computation of the gradients and
# the updates of the weights.
# During the pipeline warm up, this runs enough training to produce
# the first set of gradients.
gpu_grad_stage_ops.append(grad_stage_op)
grads = grad_stage.get()
if self.params.loss_type_to_report == 'total_loss':
loss = total_loss
else:
loss = base_loss
if self.params.print_training_accuracy:
return [logits, loss] + grads
else:
return [loss] + grads
def unpack_forward_pass_and_gradients_output(forward_pass_and_grad_outputs):
"""Unpacks outputs from forward_pass_and_gradients.
Args:
forward_pass_and_grad_outputs: Output from forward_pass_and_gradients.
Returns:
        logits: Unscaled log-probabilities from the forward pass.
If unavailable, None is returned.
loss: Loss function result from logits.
If unavailable, None is returned.
grads: Gradients for all trainable variables.
If unavailable, None is returned.
"""
logits = None
# logits is only fetched in non-train mode or when
# print_training_accuracy is set.
if not phase_train or self.params.print_training_accuracy:
logits = forward_pass_and_grad_outputs.pop(0)
loss = (
forward_pass_and_grad_outputs[0]
if forward_pass_and_grad_outputs else None)
grads = (
forward_pass_and_grad_outputs[1:]
if forward_pass_and_grad_outputs else None)
return logits, loss, grads
def make_results(logits, loss, grads):
"""Generate results based on logits, loss and grads."""
results = {} # The return value
if logits is not None:
results['logits'] = logits
accuracy_ops = self.model.accuracy_function(input_list, logits)
for name, op in accuracy_ops.items():
results['accuracy:' + name] = op
if loss is not None:
results['loss'] = loss
if grads is not None:
param_refs = self.variable_mgr.trainable_variables_on_device(
rel_device_num, abs_device_num, writable=True)
results['gradvars'] = list(zip(grads, param_refs))
return results
with tf.device(self.devices[rel_device_num]):
outputs = maybe_compile(forward_pass_and_gradients, self.params)
logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
return make_results(logits, loss, grads)
def get_input_preprocessor(self):
"""Returns the image preprocessor to used, based on the model.
Returns:
The image preprocessor, or None if synthetic data should be used.
"""
shift_ratio = 0
if self.job_name:
# shift_ratio prevents multiple workers from processing the same batch
# during a step
shift_ratio = self.task_index / self.num_workers
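      # For example (illustrative): worker 2 of 4 uses shift_ratio 0.5, so its
      # input pipeline is roughly offset by half of the data relative to
      # worker 0.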
processor_class = self.dataset.get_input_preprocessor(
self.params.input_preprocessor)
assert processor_class
subset = 'validation' if self._doing_eval else 'train'
return processor_class(
self.batch_size * self.batch_group_size,
self.model.get_input_shapes(subset),
len(self.devices) * self.batch_group_size,
dtype=self.model.data_type,
train=(not self._doing_eval),
# TODO(laigd): refactor away image model specific parameters.
distortions=self.params.distortions,
resize_method=self.resize_method,
shift_ratio=shift_ratio,
summary_verbosity=self.params.summary_verbosity,
distort_color_in_yiq=self.params.distort_color_in_yiq,
fuse_decode_and_crop=self.params.fuse_decode_and_crop,
match_mlperf=self.params.ml_perf)
def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list):
"""Adds ops to enqueue on all worker queues.
Args:
      name_prefix: prefix for the shared_name of ops.
enqueue_after_list: control dependency from ops.
Returns:
An op that should be used as control dependency before starting next step.
"""
self.sync_queue_counter += 1
with tf.device(self.sync_queue_devices[(
self.sync_queue_counter % len(self.sync_queue_devices))]):
sync_queues = [
tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]],
shared_name='%s%s' % (name_prefix, i))
for i in range(self.num_workers)]
queue_ops = []
# For each other worker, add an entry in a queue, signaling that it can
# finish this step.
token = tf.constant(False)
with tf.control_dependencies(enqueue_after_list):
for i, q in enumerate(sync_queues):
if i == self.task_index:
queue_ops.append(tf.no_op())
else:
queue_ops.append(q.enqueue(token))
# Drain tokens off queue for this worker, one for each other worker.
queue_ops.append(
sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1))
return tf.group(*queue_ops)
def _is_mkl_flag_absent(mkl_flag):
return not (absl_flags.FLAGS.is_parsed() and mkl_flag in absl_flags.FLAGS
and absl_flags.FLAGS[mkl_flag].present)
def _print_os_env_ignored_warning(mkl_flag, flag_default_val, os_env_var):
tf.logging.warn(
('OS ENV variable %s=%s is ignored and script default: '
'%s is used. Use --%s to override.') %
(os_env_var, os.environ[os_env_var], flag_default_val, mkl_flag))
def set_default_param_values_and_env_vars(params):
"""Sets up the default param values and environment variables ."""
if params.batchnorm_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
else:
os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
if params.winograd_nonfused:
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
else:
os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None)
if params.autotune_threshold:
os.environ['TF_AUTOTUNE_THRESHOLD'] = str(params.autotune_threshold)
os.environ['TF_SYNC_ON_FINISH'] = str(int(params.sync_on_finish))
argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Sets environment variables for MKL
# If OS ENV vars are overridden by script defaults, a warning msg is printed.
if params.mkl:
mkl_flags = ['kmp_blocktime', 'kmp_settings', 'kmp_affinity',
'num_intra_threads']
for mkl_flag in mkl_flags:
os_env_var = mkl_flag.upper()
if mkl_flag == 'num_intra_threads':
os_env_var = 'OMP_NUM_THREADS'
flag_val = str(getattr(params, mkl_flag))
if _is_mkl_flag_absent(mkl_flag) and os_env_var in os.environ:
_print_os_env_ignored_warning(mkl_flag, flag_val, os_env_var)
os.environ[os_env_var] = flag_val
if mkl_flag == 'num_intra_threads' and not params.num_intra_threads:
os.environ.pop(os_env_var, None)
# Sets GPU thread settings
if params.device.lower() == 'gpu':
params = params._replace(gpu_thread_mode=params.gpu_thread_mode.lower())
if params.gpu_thread_mode not in ['global', 'gpu_shared', 'gpu_private']:
raise ValueError('Invalid gpu_thread_mode: %s' % params.gpu_thread_mode)
os.environ['TF_GPU_THREAD_MODE'] = params.gpu_thread_mode
if params.per_gpu_thread_count and params.gpu_thread_mode == 'global':
raise ValueError(
'Invalid per_gpu_thread_count with gpu_thread_mode=global: %s' %
params.per_gpu_thread_count)
# Default to two threads. One for the device compute and the other for
# memory copies.
per_gpu_thread_count = params.per_gpu_thread_count or 2
total_gpu_thread_count = per_gpu_thread_count * params.num_gpus
if params.gpu_thread_mode == 'gpu_private':
os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
elif params.gpu_thread_mode == 'gpu_shared':
os.environ['TF_GPU_THREAD_COUNT'] = str(total_gpu_thread_count)
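    # For example (illustrative): with 8 GPUs and the default of 2 threads per
    # GPU, total_gpu_thread_count is 16, so gpu_private sets TF_GPU_THREAD_COUNT
    # to 2 while gpu_shared sets it to 16.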
cpu_count = multiprocessing.cpu_count()
if not params.num_inter_threads and params.gpu_thread_mode in [
'gpu_private', 'gpu_shared'
]:
main_thread_count = max(cpu_count - total_gpu_thread_count, 1)
params = params._replace(num_inter_threads=main_thread_count)
if (params.datasets_use_prefetch and
params.datasets_num_private_threads is None):
# From the total cpu thread count, subtract the total_gpu_thread_count,
# and then 2 threads per GPU device for event monitoring and sending /
# receiving tensors
num_monitoring_threads = 2 * params.num_gpus
num_private_threads = max(
cpu_count - total_gpu_thread_count - num_monitoring_threads, 1)
params = params._replace(datasets_num_private_threads=num_private_threads)
return params
def setup(params):
"""Sets up the environment that BenchmarkCNN should run in.
Args:
params: Params tuple, typically created by make_params or
make_params_from_flags.
Returns:
A potentially modified params.
Raises:
    ValueError: invalid params combination.
"""
  # Set up environment variables before doing any other global initialization,
  # so that the initialization picks up the appropriate environment variables.
params = set_default_param_values_and_env_vars(params)
# horovod needs to be initialized before create_config_proto() call since
# it will be used in config generation if enabled.
if params.variable_update == 'horovod':
import horovod.tensorflow as hvd # pylint: disable=g-import-not-at-top
hvd.init()
platforms_util.initialize(params, create_config_proto(params))
if not params.job_name:
# Create a dummy session to initialize TF global variables using the input
# params. Otherwise, ListDevices function may create global devices using
# the default config instead of using the user provided config.
#
    # TODO(hinsu): Find a way to achieve the same for the distributed benchmark.
    # It is not legal to create a distributed session after a local session. It
    # is also not possible to create a distributed session here, as that would
    # create the ClusterManager and Server multiple times.
with tf.Session(config=create_config_proto(params)) as sess:
del sess
return params
def maybe_compile(computation, params):
if params and params.xla_compile:
return tf.xla.experimental.compile(computation)
else:
return computation()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for CNN benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import threading
import numpy as np
import tensorflow.compat.v1 as tf
def tensorflow_version_tuple():
v = tf.__version__
major, minor, patch = v.split('.')
return (int(major), int(minor), patch)
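# For example (illustrative): TF version '1.15.0' gives (1, 15, '0') from the
# tuple above and 1015 from tensorflow_version() below.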
def tensorflow_version():
vt = tensorflow_version_tuple()
return vt[0] * 1000 + vt[1]
def log_fn(log):
print(log, flush=True)
def roll_numpy_batches(array, batch_size, shift_ratio):
"""Moves a proportion of batches from start to the end of the array.
This function moves a proportion of batches, specified by `shift_ratio`, from
  the start of the array to the end. The number of batches moved is rounded
down to the nearest integer. For example,
```
roll_numpy_batches([1, 2, 3, 4, 5, 6], 2, 0.34) == [3, 4, 5, 6, 1, 2]
```
Args:
array: A Numpy array whose first dimension is the batch dimension.
batch_size: The batch size.
shift_ratio: Proportion of batches to move from the start of the array to
the end of the array.
Returns:
A new Numpy array, with a proportion of the batches at the start of `array`
moved to the end.
"""
num_items = array.shape[0]
assert num_items % batch_size == 0
num_batches = num_items // batch_size
starting_batch = int(num_batches * shift_ratio)
starting_item = starting_batch * batch_size
return np.roll(array, -starting_item, axis=0)
# For Python 2.7 compatibility, we do not use threading.Barrier.
class Barrier(object):
"""Implements a lightweight Barrier.
Useful for synchronizing a fixed number of threads at known synchronization
points. Threads block on 'wait()' and simultaneously return once they have
all made that call.
  # Implementation adapted from boost/thread/barrier.hpp
"""
def __init__(self, parties):
"""Create a barrier, initialised to 'parties' threads."""
self.cond = threading.Condition(threading.Lock())
self.parties = parties
# Indicates the number of waiting parties.
self.waiting = 0
# generation is needed to deal with spurious wakeups. If self.cond.wait()
    # wakes up for other reasons, generation will force it to go back to wait().
self.generation = 0
self.broken = False
def wait(self):
"""Wait for the barrier."""
with self.cond:
# Check if the barrier has been disabled or not.
if self.broken:
return
gen = self.generation
self.waiting += 1
if self.waiting == self.parties:
self.waiting = 0
self.generation += 1
self.cond.notify_all()
# loop because of spurious wakeups
while gen == self.generation:
self.cond.wait()
# TODO(huangyp): Remove this method once we find a way to know which step
# is the last barrier.
def abort(self):
"""Clear existing barrier and disable this barrier."""
with self.cond:
if self.waiting > 0:
self.generation += 1
self.cond.notify_all()
self.broken = True
class ImageProducer(object):
"""An image producer that puts images into a staging area periodically.
This class is useful for periodically running a set of ops, `put_ops` on a
different thread every `batch_group_size` steps.
The notify_image_consumption() method is used to increment an internal counter
so that every `batch_group_size` times it is called, `put_ops` is executed. A
barrier is placed so that notify_image_consumption() will block until
the previous call to `put_ops` has been executed.
The start() method is used to start the thread that runs `put_ops`.
The done() method waits until the last put_ops is executed and stops the
thread.
The purpose of this class is to fill an image input pipeline every
`batch_group_size` steps. Suppose `put_ops` supplies `batch_group_size` images
to the input pipeline when run, and that every step, 1 batch of images is
consumed. Then, by calling notify_image_consumption() every step, images are
  supplied to the input pipeline at the same rate as they are consumed.
Example usage:
```
put_ops = ... # Enqueues `batch_group_size` batches to a StagingArea
get_op = ... # Dequeues 1 batch, and does some operations on it
batch_group_size = 4
with tf.Session() as sess:
    image_producer = cnn_util.ImageProducer(
        sess, put_ops, batch_group_size, use_python32_barrier=False)
image_producer.start()
for _ in range(100):
sess.run(get_op)
image_producer.notify_image_consumption()
```
"""
def __init__(self, sess, put_ops, batch_group_size, use_python32_barrier):
self.sess = sess
self.num_gets = 0
self.put_ops = put_ops
self.batch_group_size = batch_group_size
self.done_event = threading.Event()
if (use_python32_barrier and
sys.version_info[0] == 3 and sys.version_info[1] >= 2):
self.put_barrier = threading.Barrier(2)
else:
self.put_barrier = Barrier(2)
def _should_put(self):
return (self.num_gets + 1) % self.batch_group_size == 0
def done(self):
"""Stop the image producer."""
self.done_event.set()
self.put_barrier.abort()
self.thread.join()
def start(self):
"""Start the image producer."""
self.sess.run([self.put_ops])
self.thread = threading.Thread(target=self._loop_producer)
# Set daemon to true to allow Ctrl + C to terminate all threads.
self.thread.daemon = True
self.thread.start()
def notify_image_consumption(self):
"""Increment the counter of image_producer by 1.
This should only be called by the main thread that consumes images and runs
the model computation. One batch of images should be consumed between
calling start() and the first call to this method. Then, one batch of images
should be consumed between any two successive calls to this method.
"""
if self._should_put():
self.put_barrier.wait()
self.num_gets += 1
def _loop_producer(self):
    while not self.done_event.is_set():
self.sess.run([self.put_ops])
self.put_barrier.wait()
class BaseClusterManager(object):
"""The manager for the cluster of servers running the benchmark."""
def __init__(self, params):
worker_hosts = params.worker_hosts.split(',')
ps_hosts = params.ps_hosts.split(',') if params.ps_hosts else []
cluster = {'worker': worker_hosts}
if ps_hosts:
cluster['ps'] = ps_hosts
self._cluster_spec = tf.train.ClusterSpec(cluster)
def get_target(self):
"""Returns a target to be passed to tf.Session()."""
raise NotImplementedError('get_target must be implemented by subclass')
def join_server(self):
    raise NotImplementedError('join_server must be implemented by subclass')
def get_cluster_spec(self):
return self._cluster_spec
def num_workers(self):
return len(self._cluster_spec.job_tasks('worker'))
def num_ps(self):
if 'ps' in self._cluster_spec.jobs:
return len(self._cluster_spec.job_tasks('ps'))
else:
return 0
class GrpcClusterManager(BaseClusterManager):
"""A cluster manager for a cluster networked with gRPC."""
def __init__(self, params, config_proto):
super(GrpcClusterManager, self).__init__(params)
if params.job_name == 'controller':
self._target = 'grpc://%s' % self._cluster_spec.job_tasks('worker')[0]
else:
self._server = tf.train.Server(self._cluster_spec,
job_name=params.job_name,
task_index=params.task_index,
config=config_proto,
protocol=params.server_protocol)
self._target = self._server.target
def get_target(self):
return self._target
def join_server(self):
return self._server.join()
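# Minimal usage sketch for the cluster managers above (illustrative only; the
# helper name and the namedtuple stand-in for `params` are assumptions, since
# the real benchmark passes its own flags object carrying these attributes).
# With job_name='controller' no tf.train.Server is started; the manager just
# derives a gRPC target from the first worker address.
def _example_grpc_cluster_manager():
  import collections
  fake_params_type = collections.namedtuple(
      'FakeParams',
      ['worker_hosts', 'ps_hosts', 'job_name', 'task_index',
       'server_protocol'])
  params = fake_params_type(
      worker_hosts='host1:2222,host2:2222',
      ps_hosts='host3:2222',
      job_name='controller',
      task_index=0,
      server_protocol='grpc')
  manager = GrpcClusterManager(params, tf.ConfigProto())
  # The target (e.g. 'grpc://host1:2222') can be passed to tf.Session(target=).
  return manager.get_target()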