Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
6b6f8b0c
Commit
6b6f8b0c
authored
Apr 15, 2022
by
huchen
Browse files
del tensorflow benchmark cls
parent
4749cd5e
Changes
149
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
8858 deletions
+0
-8858
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/flags.cpython-36.pyc
...cripts/tf_cnn_benchmarks/__pycache__/flags.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/mlperf.cpython-36.pyc
...ripts/tf_cnn_benchmarks/__pycache__/mlperf.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/preprocessing.cpython-36.pyc
...f_cnn_benchmarks/__pycache__/preprocessing.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/ssd_constants.cpython-36.pyc
...f_cnn_benchmarks/__pycache__/ssd_constants.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr.cpython-36.pyc
...tf_cnn_benchmarks/__pycache__/variable_mgr.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr_util.cpython-36.pyc
...n_benchmarks/__pycache__/variable_mgr_util.cpython-36.pyc
+0
-0
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py
...nchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py
+0
-290
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py
...rk/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py
+0
-52
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce.py
...fication/benchmark/scripts/tf_cnn_benchmarks/allreduce.py
+0
-645
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce_test.py
...ion/benchmark/scripts/tf_cnn_benchmarks/allreduce_test.py
+0
-448
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/batch_allreduce.py
...on/benchmark/scripts/tf_cnn_benchmarks/batch_allreduce.py
+0
-628
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn.py
...tion/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn.py
+0
-3542
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test.py
...ripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test.py
+0
-493
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test_runner.py
...f_cnn_benchmarks/benchmark_cnn_distributed_test_runner.py
+0
-122
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_test.py
...benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_test.py
+0
-1493
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util.py
...ification/benchmark/scripts/tf_cnn_benchmarks/cnn_util.py
+0
-253
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util_test.py
...tion/benchmark/scripts/tf_cnn_benchmarks/cnn_util_test.py
+0
-129
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/coco_metric.py
...cation/benchmark/scripts/tf_cnn_benchmarks/coco_metric.py
+0
-198
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/constants.py
...fication/benchmark/scripts/tf_cnn_benchmarks/constants.py
+0
-67
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/convnet_builder.py
...on/benchmark/scripts/tf_cnn_benchmarks/convnet_builder.py
+0
-498
No files found.
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/flags.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/mlperf.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/preprocessing.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/ssd_constants.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/__pycache__/variable_mgr_util.cpython-36.pyc
deleted
100644 → 0
View file @
4749cd5e
File deleted
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks.
tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is
useful for benchmarking the performance of just this gradient aggregation,
instead of the entire model. All the flags that tf_cnn_benchmarks accepts are
also accepted by this script, although many are silently ignored.
The number and shapes of the tensors all-reduced are those of the variables of
the model specified by the --model flag.
TODO(reedwm): Allow custom sizes to be specified.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
time
from
absl
import
app
from
absl
import
flags
as
absl_flags
import
tensorflow.compat.v1
as
tf
from
tensorflow.python.ops
import
control_flow_ops
import
benchmark_cnn
import
cnn_util
import
flags
from
cnn_util
import
log_fn
absl_flags
.
DEFINE_integer
(
'iters_per_step'
,
5
,
'Number of iterations to run all-reduce for, per '
'step. Every step, a session will be run on a Graph '
'that contains this many copies of the all-reduce. '
'The copies are run sequentially. Setting this above '
'1 is useful to lower the overhead of starting the '
'session run, running the VariableV2 ops at the '
'start of the step, etc.'
)
flags
.
define_flags
()
for
name
in
flags
.
param_specs
.
keys
():
absl_flags
.
declare_key_flag
(
name
)
def
get_var_shapes
(
model
):
"""Returns the list of variable shapes for a tf_cnn_benchmarks Model."""
with
tf
.
Graph
().
as_default
():
# The variable shapes do not depend on the batch size.
images
=
tf
.
placeholder
(
tf
.
float32
,
model
.
get_input_shapes
(
'train'
)[
0
])
model
.
build_network
([
images
])
return
[[
int
(
d
)
for
d
in
v
.
shape
.
dims
]
for
v
in
tf
.
trainable_variables
()]
def
all_reduce
(
all_device_tensors
,
variable_mgr
):
"""Performs a single batch all-reduce.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
variable_mgr: The VariableMgr to perform the all-reduce.
Returns:
List of list of tensors in the same form as `all_device_tensors`, except the
tensors are aggregated across towers.
"""
tower_grads
=
[[(
g
,
None
)
for
g
in
device_tensors
]
for
device_tensors
in
all_device_tensors
]
_
,
aggregated_tower_grads
=
variable_mgr
.
preprocess_device_grads
(
tower_grads
)
return
[
[
g
for
g
,
_
in
agg_device_tensors
]
for
agg_device_tensors
in
aggregated_tower_grads
]
def
build_all_reduce_iterations
(
all_device_tensors
,
tower_devices
,
variable_mgr
,
num_iters
):
"""Builds the all-reduce ops for multiple iterations to aggregate tensors.
The tensors in `all_device_tensors` are aggregated `num_iters` times. Each
iteration aggregates the results from the previous iteration. The iterations
are run sequentially, so the aggregations for an iteration do not start
running until the previous iteration has completed. Each iteration after the
first is aggregating already-aggregated values, but it does not matter because
we are only aggregating for benchmarking purposes.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
tower_devices: A list of device strings. tower_devices[t] is the device
of the tensors in all_device_tensors[t].
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that when run, causes the all-reduce ops to run.
"""
for
i
in
range
(
num_iters
):
with
tf
.
name_scope
(
'iteration_%d'
%
i
):
# Step 1: Do the aggregation.
with
tf
.
name_scope
(
'tensor_aggregation'
):
all_device_tensors
=
all_reduce
(
all_device_tensors
,
variable_mgr
)
# Step 2. Create identity ops, to bring the aggregated results back to
# each device.
new_all_device_tensors
=
[]
for
device
,
device_tensors
in
zip
(
tower_devices
,
all_device_tensors
):
with
tf
.
device
(
device
):
new_all_device_tensors
.
append
([
tf
.
identity
(
t
,
name
=
'identity_after_allreduce'
)
for
t
in
device_tensors
])
all_device_tensors
=
new_all_device_tensors
# Step 3. Add control dependencies to delay the next iteration until this
# iteration is complete. To avoid extra overhead, we do not have any
# cross-device control dependencies, which means it's possible for two
# iterations to slightly overlap.
new_all_device_tensors
=
[]
for
device_tensors
in
all_device_tensors
:
new_all_device_tensors
.
append
([
control_flow_ops
.
with_dependencies
(
device_tensors
,
t
,
name
=
'identity_after_dependencies'
)
for
t
in
device_tensors
])
all_device_tensors
=
new_all_device_tensors
# To prevent the dependency optimizer from removing every op we created,
# we store the results in variables.
ops_to_run
=
[]
for
device
,
device_tensors
in
zip
(
tower_devices
,
all_device_tensors
):
with
tf
.
device
(
device
):
for
t
in
device_tensors
:
# The placeholder initial value is never run.
var
=
tf
.
Variable
(
tf
.
placeholder
(
tf
.
float32
,
t
.
shape
),
collections
=
[])
ops_to_run
.
append
(
var
.
assign
(
t
))
return
tf
.
group
(
*
ops_to_run
)
def
build_graph
(
tower_devices
,
tensor_shapes
,
variable_mgr
,
num_iters
):
"""Builds the graph for the benchmark.
Args:
tower_devices: A list of device strings of the devices to run the all-reduce
benchmark on.
tensor_shapes: A list of shapes of the tensors that will be aggregated for
the all-reduce.
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that runs the benchmark.
"""
all_device_tensors
=
[]
for
i
,
tower_device
in
enumerate
(
tower_devices
):
with
tf
.
device
(
tower_device
):
device_tensors
=
[]
for
j
,
shape
in
enumerate
(
tensor_shapes
):
tensor
=
tf
.
Variable
(
tf
.
random_normal
(
shape
,
dtype
=
tf
.
float32
),
name
=
'tensor_%d_on_device_%d'
%
(
j
,
i
))
device_tensors
.
append
(
tensor
)
all_device_tensors
.
append
(
device_tensors
)
log_fn
(
'Building all-reduce ops'
)
benchmark_op
=
build_all_reduce_iterations
(
all_device_tensors
,
tower_devices
,
variable_mgr
,
num_iters
)
log_fn
(
'Done building all-reduce ops'
)
return
benchmark_op
def
run_graph
(
benchmark_op
,
bench_cnn
,
init_ops
,
dummy_loss_op
):
"""Runs the graph for the benchmark.
Args:
benchmark_op: An op that runs the benchmark.
bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
init_ops: A list of ops that are run before `benchmark_op` for
initialization.
dummy_loss_op: Any op. We must pass a loss op to
`benchmark_cnn.benchmark_one_step`, but the result of the op is never
actually used.
"""
config
=
benchmark_cnn
.
create_config_proto
(
bench_cnn
.
params
)
with
tf
.
Session
(
config
=
config
)
as
sess
:
for
op
in
init_ops
:
sess
.
run
(
op
)
step_train_times
=
[]
fetches
=
{
'average_loss'
:
dummy_loss_op
,
'benchmark_op'
:
benchmark_op
}
log_fn
(
'Running warmup'
)
for
i
in
range
(
-
bench_cnn
.
num_warmup_batches
,
bench_cnn
.
num_batches
):
if
i
==
0
:
log_fn
(
'Running all-reduce ops'
)
start
=
time
.
time
()
if
i
>
0
and
i
%
bench_cnn
.
params
.
display_every
==
0
:
log_fn
(
'Iteration: %d. Average time per step so far: %s'
%
(
i
,
(
time
.
time
()
-
start
)
/
i
))
# Call benchmark_one_step instead of directly calling sess.run(...), to
# potentially get a trace file, partitioned graphs, etc.
benchmark_cnn
.
benchmark_one_step
(
sess
=
sess
,
fetches
=
fetches
,
step
=
i
,
# The batch size is only used for the images/sec calculation, which is
# not actually calculated because we pass show_images_per_sec=False.
batch_size
=
None
,
step_train_times
=
step_train_times
,
trace_filename
=
bench_cnn
.
trace_filename
,
partitioned_graph_file_prefix
=
(
bench_cnn
.
params
.
partitioned_graph_file_prefix
),
profiler
=
None
,
image_producer
=
None
,
params
=
bench_cnn
.
params
,
show_images_per_sec
=
False
)
log_fn
(
'Average time per step: %s'
%
((
time
.
time
()
-
start
)
/
bench_cnn
.
num_batches
))
def
run_benchmark
(
bench_cnn
,
num_iters
):
"""Runs the all-reduce benchmark.
Args:
bench_cnn: The BenchmarkCNN where params, the variable manager, and other
attributes are obtained.
num_iters: Number of iterations to do all-reduce for for.
Raises:
ValueError: Invalid params of bench_cnn.
"""
if
bench_cnn
.
params
.
variable_update
!=
'replicated'
:
raise
ValueError
(
'--variable_update=replicated must be specified to use'
'the all-reduce benchmark'
)
if
bench_cnn
.
params
.
variable_consistency
==
'relaxed'
:
raise
ValueError
(
'--variable_consistency=relaxed is not supported'
)
benchmark_op
=
build_graph
(
bench_cnn
.
raw_devices
,
get_var_shapes
(
bench_cnn
.
model
),
bench_cnn
.
variable_mgr
,
num_iters
)
init_ops
=
[
tf
.
global_variables_initializer
(),
bench_cnn
.
variable_mgr
.
get_post_init_ops
()
]
loss_op
=
tf
.
no_op
()
if
bench_cnn
.
graph_file
:
path
,
filename
=
os
.
path
.
split
(
bench_cnn
.
graph_file
)
as_text
=
filename
.
endswith
(
'txt'
)
log_fn
(
'Writing GraphDef as %s to %s'
%
(
'text'
if
as_text
else
'binary'
,
bench_cnn
.
graph_file
))
tf
.
train
.
write_graph
(
tf
.
get_default_graph
().
as_graph_def
(
add_shapes
=
True
),
path
,
filename
,
as_text
)
run_graph
(
benchmark_op
,
bench_cnn
,
init_ops
,
loss_op
)
# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks
def
main
(
positional_arguments
):
# Command-line arguments like '--distortions False' are equivalent to
# '--distortions=True False', where False is a positional argument. To prevent
# this from silently running with distortions, we do not allow positional
# arguments.
assert
len
(
positional_arguments
)
>=
1
if
len
(
positional_arguments
)
>
1
:
raise
ValueError
(
'Received unknown positional arguments: %s'
%
positional_arguments
[
1
:])
params
=
benchmark_cnn
.
make_params_from_flags
()
params
=
benchmark_cnn
.
setup
(
params
)
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
tfversion
=
cnn_util
.
tensorflow_version_tuple
()
log_fn
(
'TensorFlow: %i.%i'
%
(
tfversion
[
0
],
tfversion
[
1
]))
run_benchmark
(
bench
,
absl_flags
.
FLAGS
.
iters_per_step
)
if
__name__
==
'__main__'
:
tf
.
disable_v2_behavior
()
app
.
run
(
main
)
# Raises error on invalid flags, unlike tf.app.run()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for all_reduce_benchmark.py."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow.compat.v1
as
tf
import
all_reduce_benchmark
import
benchmark_cnn
import
test_util
class
AllReduceBenchmarkTest
(
tf
.
test
.
TestCase
):
"""Tests the all-reduce benchmark."""
def
_test_run_benchmark
(
self
,
params
):
"""Tests that run_benchmark() runs successfully with the params."""
logs
=
[]
with
test_util
.
monkey_patch
(
all_reduce_benchmark
,
log_fn
=
test_util
.
print_and_add_to_list
(
logs
)):
bench_cnn
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
all_reduce_benchmark
.
run_benchmark
(
bench_cnn
,
num_iters
=
5
)
self
.
assertRegex
(
logs
[
-
1
],
'^Average time per step: [0-9.]+$'
)
def
test_run_benchmark
(
self
):
"""Tests that run_benchmark() runs successfully."""
params
=
benchmark_cnn
.
make_params
(
num_batches
=
10
,
variable_update
=
'replicated'
,
num_gpus
=
2
)
self
.
_test_run_benchmark
(
params
)
params
=
params
.
_replace
(
hierarchical_copy
=
True
,
gradient_repacking
=
8
,
num_gpus
=
8
)
self
.
_test_run_benchmark
(
params
)
if
__name__
==
'__main__'
:
tf
.
disable_v2_behavior
()
tf
.
test
.
main
()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for allreduce."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
as
pycoll
import
re
from
six.moves
import
xrange
# pylint: disable=redefined-builtin
import
tensorflow.compat.v1
as
tf
# pylint: disable=g-direct-tensorflow-import
from
tensorflow.python.distribute
import
all_reduce
from
tensorflow.python.framework
import
device
as
pydev
from
tensorflow.python.framework
import
ops
from
tensorflow.python.ops
import
collective_ops
AllReduceSpecTuple
=
pycoll
.
namedtuple
(
'AllReduceSpecTuple'
,
'alg shards limit'
)
def
parse_general_int
(
s
):
"""Parse integer with power-of-2 suffix eg. 32k."""
mo
=
re
.
match
(
r
'(\d+)([KkMGT]?)$'
,
s
)
if
mo
:
i
,
suffix
=
mo
.
group
(
1
,
2
)
v
=
int
(
i
)
if
suffix
:
if
suffix
==
'K'
or
suffix
==
'k'
:
v
*=
1024
elif
suffix
==
'M'
:
v
*=
(
1024
*
1024
)
elif
suffix
==
'G'
:
v
*=
(
1024
*
1024
*
1024
)
elif
suffix
==
'T'
:
v
*=
(
1024
*
1024
*
1024
*
1024
)
else
:
raise
ValueError
(
'invalid integer string %s'
%
s
)
return
v
else
:
v
=
int
(
s
)
return
v
def
parse_all_reduce_spec
(
all_reduce_spec
):
"""Parse all_reduce_spec.
Args:
all_reduce_spec: a string specifying a combination of all-reduce
algorithms to apply for gradient reduction.
Returns:
a list of AllReduceSpecTuple.
Raises:
ValueError: all_reduce_spec is not well-formed.
An all_reduce_spec has BNF form:
int ::= positive whole number
g_int ::= int[KkMGT]?
alg_spec ::= alg | alg#int
range_spec ::= alg_spec | alg_spec/alg_spec
spec ::= range_spec | range_spec:g_int:range_spec
Not all syntactically correct specifications are supported.
Examples of supported all_reduce_spec strings, with semantics explained:
'collective' == apply tf.collective_reduce operator to all tensors.
'collective#2' == apply tf.collective_reduce operator to all tensors,
requesting up to 2 simultaneous transfers at each node, if
feasible, by subdividing tensor by an additional factor of 2.
'xring' == apply ring all-reduce to all tensors
'xring#2' == apply ring all-reduce to all tensors, using two simultaneous
transfer rings, each operating on 1/2 of each tensor.
'nccl' == apply NCCL all-reduce to all tensors (only works within
a single worker process where all devices are GPUs)
'nccl/xring' == apply NCCL all-reduce to all tensors within each worker
to produce at least one full-reduced (locally) value,
then apply ring all-reduce to one such value from each
worker, then apply NCCL broadcast to propagate those globally
reduced values back to every device within each worker.
'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each
distributed tensor is reduced by copying all instances to
one of the worker CPUs, computing the reduction there, then
copying back to each participating device. Tensor reductions
are assigned to specific CPUs round-robin.
'psgpu#4' == Arrange all GPUs across all workers into groups of 4.
Each distributed tensor is shuffle reduced against one
such group of 4 GPUs, selected round-robin. That is, each
tensor is split across 4 shards for the reduction.
'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to
tensors of size <= 2048 elements, apply 2-shard pscpu to
tensors up to size 64k elements, apply xring to larger tensors.
'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on
the worker's CPU, then use 2-shard shuffle to reduce those
locally reduced tensors across workers (on the worker CPUs), then
scatter the globally reduced values locally from each worker CPU.
"""
range_parts
=
all_reduce_spec
.
split
(
':'
)
+
[
'-1'
]
if
len
(
range_parts
)
%
2
:
raise
ValueError
(
'all_reduce_spec not well formed: %s'
%
all_reduce_spec
)
limit
=
0
spec
=
[]
alg
=
None
shards
=
1
for
i
,
range_part
in
enumerate
(
range_parts
):
if
i
%
2
==
1
:
try
:
limit
=
parse_general_int
(
range_part
)
spec
.
append
(
AllReduceSpecTuple
(
alg
=
alg
,
shards
=
shards
,
limit
=
limit
))
except
ValueError
:
raise
ValueError
(
'all_reduce_spec (%s) contains non-integer range %s'
%
(
all_reduce_spec
,
range_part
))
else
:
alg
=
range_part
alg_parts
=
range_part
.
split
(
'#'
)
alg
=
alg_parts
[
0
]
if
len
(
alg_parts
)
>
1
:
try
:
shards
=
int
(
alg_parts
[
1
])
except
ValueError
:
raise
ValueError
(
'all_reduce_spec (%s) contains non-integer '
'shards %s'
%
all_reduce_spec
,
alg_parts
[
1
])
else
:
shards
=
1
if
alg
not
in
[
'nccl'
,
'nccl/xring'
,
'nccl/rechd'
,
'nccl/pscpu'
,
'xring'
,
'pscpu'
,
'psgpu'
,
'pscpu/pscpu'
,
'collective'
]:
raise
ValueError
(
'all_reduce_spec (%s) contains invalid alg %s'
%
(
all_reduce_spec
,
alg
))
return
spec
def
build_all_reduce_device_prefixes
(
job_name
,
num_tasks
):
"""Build list of device prefix names for all_reduce.
Args:
job_name: 'worker', 'ps' or 'localhost'.
num_tasks: number of jobs across which device names should be generated.
Returns:
A list of device name prefix strings. Each element spells out the full
host name without adding the device.
e.g. '/job:worker/task:0'
"""
if
job_name
!=
'localhost'
:
return
[
'/job:%s/task:%d'
%
(
job_name
,
d
)
for
d
in
range
(
0
,
num_tasks
)]
else
:
assert
num_tasks
==
1
return
[
'/job:%s'
%
job_name
]
def
group_device_names
(
devices
,
group_size
):
"""Group device names into groups of group_size.
Args:
devices: list of strings naming devices.
group_size: int >= 1
Returns:
list of lists of devices, where each inner list is group_size long,
and each device appears at least once in an inner list. If
len(devices) % group_size = 0 then each device will appear
exactly once.
Raises:
ValueError: group_size > len(devices)
"""
num_devices
=
len
(
devices
)
if
group_size
>
num_devices
:
raise
ValueError
(
'only %d devices, but group_size=%d'
%
(
num_devices
,
group_size
))
num_groups
=
(
num_devices
//
group_size
+
(
1
if
(
num_devices
%
group_size
!=
0
)
else
0
))
groups
=
[[]
for
i
in
range
(
num_groups
)]
for
i
in
range
(
0
,
num_groups
*
group_size
):
groups
[
i
%
num_groups
].
append
(
devices
[
i
%
num_devices
])
return
groups
def
split_grads_by_size
(
threshold_size
,
device_grads
):
"""Break gradients into two sets according to tensor size.
Args:
threshold_size: int size cutoff for small vs large tensor.
device_grads: List of lists of (gradient, variable) tuples. The outer
list is over devices. The inner list is over individual gradients.
Returns:
small_grads: Subset of device_grads where shape is <= theshold_size
elements.
large_grads: Subset of device_grads where shape is > threshold_size
elements.
"""
small_grads
=
[]
large_grads
=
[]
for
dl
in
device_grads
:
small_dl
=
[]
large_dl
=
[]
for
(
g
,
v
)
in
dl
:
tensor_size
=
g
.
get_shape
().
num_elements
()
if
tensor_size
<=
threshold_size
:
small_dl
.
append
([
g
,
v
])
else
:
large_dl
.
append
([
g
,
v
])
if
small_dl
:
small_grads
.
append
(
small_dl
)
if
large_dl
:
large_grads
.
append
(
large_dl
)
return
small_grads
,
large_grads
_instance_key
=
1
def
new_collective_instance_key
():
"""Returns a new instance key for use in defining a collective op."""
global
_instance_key
v
=
_instance_key
_instance_key
+=
1
return
v
_group_key
=
1
_group_key_table
=
dict
()
def
collective_group_key
(
devices
):
"""Returns a group key for the set of devices.
Args:
devices: list of strings naming devices in a collective group.
Returns:
int key uniquely identifying the set of device names.
"""
global
_group_key
global
_group_key_table
parsed
=
[
pydev
.
DeviceSpec
.
from_string
(
d
)
for
d
in
devices
]
names
=
sorted
([
'%s:%d'
%
(
d
.
device_type
,
d
.
device_index
)
for
d
in
parsed
])
concat
=
','
.
join
(
names
)
if
concat
not
in
_group_key_table
.
keys
():
new_key
=
_group_key
_group_key
+=
1
_group_key_table
[
concat
]
=
new_key
rv
=
_group_key_table
[
concat
]
return
rv
def
build_collective_reduce
(
input_tensors
,
num_workers
,
num_shards
,
red_op
=
'Add'
,
un_op
=
'Id'
):
"""Build a subgraph that does one full all-reduce, using the collective Op.
Args:
input_tensors: tensors within a single worker graph that are to be reduced
together; must be one per device.
num_workers: total number of workers with identical independent graphs that
will be doing this same reduction. The reduction will actually include
the corresponding tensors at all these workers.
num_shards: number of shards into which to divide each per-tick chunk,
normally 1 but could be higher on multi-data-path architectures.
red_op: string naming the reduction op
un_op: string naming the unary final op
Returns:
An array of final tensors, one per device, computed by the full reduction.
Raises:
ValueError: There must be at least two tensors over all the workers.
"""
group_size
=
len
(
input_tensors
)
*
num_workers
if
group_size
<
2
:
raise
ValueError
(
'num_workers * len(input_tensors) must be 2 or greater'
)
devices
=
[
t
.
device
for
t
in
input_tensors
]
num_devices
=
len
(
devices
)
group_key
=
collective_group_key
(
devices
)
instance_key
=
new_collective_instance_key
()
out_tensors
=
[]
if
num_shards
==
1
:
subdiv_offsets
=
[
0
]
elif
num_shards
==
2
:
if
num_devices
>
1
:
subdiv_offsets
=
[
0
,
-
(
num_devices
//
2
)]
else
:
subdiv_offsets
=
[
0
]
else
:
raise
ValueError
(
'Unsupported num_shards %d'
%
num_shards
)
for
d
in
range
(
num_devices
):
with
ops
.
device
(
devices
[
d
]):
reduce_op
=
collective_ops
.
all_reduce
(
input_tensors
[
d
],
group_size
,
group_key
,
instance_key
,
red_op
,
un_op
,
subdiv_offsets
)
out_tensors
.
append
(
reduce_op
)
return
out_tensors
def
broadcast_send
(
t
,
shape
,
dtype
,
group_size
,
group_key
,
instance_key
):
return
collective_ops
.
broadcast_send
(
t
,
shape
,
dtype
,
group_size
,
group_key
,
instance_key
)
def
broadcast_recv
(
shape
,
dtype
,
group_size
,
group_key
,
instance_key
):
return
collective_ops
.
broadcast_recv
(
shape
,
dtype
,
group_size
,
group_key
,
instance_key
)
def
sum_grad_and_var_all_reduce
(
single_session
,
grad_and_vars
,
num_workers
,
alg
,
gpu_indices
,
aux_devices
=
None
,
num_shards
=
1
):
"""Apply all-reduce algorithm over specified gradient tensors."""
scaled_grads
=
[
g
for
g
,
_
in
grad_and_vars
]
if
alg
==
'collective'
:
assert
not
single_session
summed_grads
=
build_collective_reduce
(
scaled_grads
,
num_workers
,
num_shards
,
'Add'
,
'Id'
)
else
:
with
tf
.
name_scope
(
'allreduce'
):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
if
alg
==
'nccl'
:
summed_grads
=
all_reduce
.
build_nccl_all_reduce
(
scaled_grads
,
tf
.
add
)
elif
alg
==
'xring'
:
summed_grads
=
all_reduce
.
build_ring_all_reduce
(
scaled_grads
,
num_workers
,
num_shards
,
gpu_indices
,
tf
.
add
)
elif
alg
==
'nccl/xring'
:
summed_grads
=
all_reduce
.
build_nccl_then_ring
(
scaled_grads
,
num_shards
,
tf
.
add
)
elif
alg
==
'nccl/rechd'
:
summed_grads
=
all_reduce
.
build_nccl_then_recursive_hd
(
scaled_grads
,
tf
.
add
)
elif
alg
==
'nccl/pscpu'
:
summed_grads
=
all_reduce
.
build_nccl_then_shuffle
(
scaled_grads
,
aux_devices
,
tf
.
add
,
tf
.
add_n
)
elif
alg
==
'pscpu/pscpu'
:
summed_grads
=
all_reduce
.
build_shuffle_then_shuffle
(
scaled_grads
,
aux_devices
,
# TODO(tucker): devise a way of better specifying the device set
# for the second level.
[
aux_devices
[
0
]],
tf
.
add_n
)
elif
alg
in
[
'pscpu'
,
'psgpu'
]:
summed_grads
=
all_reduce
.
build_shuffle_all_reduce
(
scaled_grads
,
aux_devices
,
tf
.
add_n
)
else
:
raise
ValueError
(
'unsupported all_reduce alg: '
,
alg
)
result
=
[]
for
(
_
,
v
),
g
in
zip
(
grad_and_vars
,
summed_grads
):
result
.
append
([
g
,
v
])
return
result
def
contains_any
(
haystack
,
needles
):
"""Tests if any needle is a substring of haystack.
Args:
haystack: a string
needles: list of strings
Returns:
True if any element of needles is a substring of haystack,
False otherwise.
"""
for
n
in
needles
:
if
n
in
haystack
:
return
True
return
False
def
sum_gradients_all_reduce
(
single_session
,
dev_prefixes
,
tower_grads
,
num_workers
,
alg
,
num_shards
,
gpu_indices
,
agg_small_grads_max_bytes
=
0
,
agg_small_grads_max_group
=
10
,
allreduce_merge_scope
=
1
):
"""Apply all-reduce algorithm over specified gradient tensors.
Args:
single_session: true if reduction is applied to one graph across
all workers, false if ths application is to a single-worker graph only.
dev_prefixes: list of prefix strings to use to generate PS device names.
tower_grads: the gradients to reduce.
num_workers: number of worker processes across entire job.
alg: the all-reduce algorithm to apply.
num_shards: alg-specific sharding factor.
gpu_indices: indices of local GPUs in order usable for ring-reduce.
agg_small_grads_max_bytes: largest tensor eligible for aggregation,
in number of bytes.
agg_small_grads_max_group: largest permitted aggregation of small
tensors.
allreduce_merge_scope: size of groups into which to partition consecutive
gradients grouped under a common 'allreduce' name scope for application
of ScopedAllocator optimization.
Returns:
list of reduced tensors
"""
alg_contains_shuffle
=
contains_any
(
alg
,
[
'pscpu'
,
'psgpu'
])
is_hierarchical
=
'/'
in
alg
if
'pscpu'
in
alg
:
aux_devices
=
[
prefix
+
'/cpu:0'
for
prefix
in
dev_prefixes
]
elif
'psgpu'
in
alg
:
aux_devices
=
[
prefix
+
'/gpu:%d'
%
i
for
i
in
range
(
len
(
gpu_indices
))
for
prefix
in
dev_prefixes
]
else
:
aux_devices
=
[
'/job:localhost/cpu:0'
]
aux_device_groups
=
group_device_names
(
aux_devices
,
num_shards
if
(
alg
!=
'collective'
and
alg_contains_shuffle
)
else
1
)
group_index
=
0
if
agg_small_grads_max_bytes
>
0
and
agg_small_grads_max_group
>
0
:
tower_grads
,
packing
=
pack_small_tensors
(
tower_grads
,
max_bytes
=
agg_small_grads_max_bytes
,
max_group
=
agg_small_grads_max_group
)
else
:
packing
=
None
reduced_gv_list
=
[]
gv
=
list
(
zip
(
*
tower_grads
))
merge_scope
=
allreduce_merge_scope
if
allreduce_merge_scope
>
0
else
1
chunked_gv
=
[
gv
[
x
:
x
+
merge_scope
]
for
x
in
xrange
(
0
,
len
(
gv
),
merge_scope
)]
for
chunk
in
chunked_gv
:
with
tf
.
name_scope
(
'allreduce'
):
for
grad_and_vars
in
chunk
:
reduced_gv_list
.
append
(
sum_grad_and_var_all_reduce
(
single_session
,
grad_and_vars
,
num_workers
,
alg
,
gpu_indices
,
(
aux_devices
if
is_hierarchical
else
aux_device_groups
[
group_index
]),
num_shards
))
group_index
=
(
group_index
+
1
)
%
len
(
aux_device_groups
)
new_tower_grads
=
[
list
(
x
)
for
x
in
zip
(
*
reduced_gv_list
)]
if
packing
:
new_tower_grads
=
unpack_small_tensors
(
new_tower_grads
,
packing
)
return
new_tower_grads
def
extract_ranges
(
index_list
,
range_size_limit
=
32
):
"""Extract consecutive ranges and singles from index_list.
Args:
index_list: List of monotone increasing non-negative integers.
range_size_limit: Largest size range to return. If a larger
consecutive range exists it will be returned as multiple
ranges.
Returns:
ranges, singles where ranges is a list of [first, last] pairs of
consecutive elements in index_list, and singles is all of the
other elements, in original order.
"""
if
not
index_list
:
return
[],
[]
first
=
index_list
[
0
]
last
=
first
ranges
=
[]
singles
=
[]
for
i
in
index_list
[
1
:]:
if
i
==
last
+
1
and
(
last
-
first
)
<=
range_size_limit
:
last
=
i
else
:
if
last
>
first
:
ranges
.
append
([
first
,
last
])
else
:
singles
.
append
(
first
)
first
=
i
last
=
i
if
last
>
first
:
ranges
.
append
([
first
,
last
])
else
:
singles
.
append
(
first
)
return
ranges
,
singles
GradPackTuple
=
pycoll
.
namedtuple
(
'GradPackTuple'
,
'indices vars shapes'
)
def
pack_range
(
key
,
packing
,
grad_vars
,
rng
):
"""Form the concatenation of a specified range of gradient tensors.
Args:
key: Value under which to store meta-data in packing that will be used
later to restore the grad_var list structure.
packing: Dict holding data describing packed ranges of small tensors.
grad_vars: List of (grad, var) pairs for one tower.
rng: A pair of integers giving the first, last indices of a consecutive
range of tensors to be packed.
Returns:
A tensor that is the concatenation of all the specified small tensors.
"""
to_pack
=
grad_vars
[
rng
[
0
]:
rng
[
1
]
+
1
]
members
=
[]
variables
=
[]
restore_shapes
=
[]
with
tf
.
name_scope
(
'pack'
):
for
g
,
v
in
to_pack
:
variables
.
append
(
v
)
restore_shapes
.
append
(
g
.
shape
)
with
tf
.
device
(
g
.
device
):
members
.
append
(
tf
.
reshape
(
g
,
[
-
1
]))
packing
[
key
]
=
GradPackTuple
(
indices
=
range
(
rng
[
0
],
rng
[
1
]
+
1
),
vars
=
variables
,
shapes
=
restore_shapes
)
with
tf
.
device
(
members
[
0
].
device
):
return
tf
.
concat
(
members
,
0
)
def
unpack_grad_tuple
(
gv
,
gpt
):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, maybe following subsequent operations like
reduction.
"""
elt_widths
=
[
x
.
num_elements
()
for
x
in
gpt
.
shapes
]
with
tf
.
device
(
gv
[
0
][
0
].
device
):
with
tf
.
name_scope
(
'unpack'
):
splits
=
tf
.
split
(
gv
[
0
],
elt_widths
)
unpacked_gv
=
[]
for
idx
,
s
in
enumerate
(
splits
):
unpacked_gv
.
append
((
tf
.
reshape
(
s
,
gpt
.
shapes
[
idx
]),
gpt
.
vars
[
idx
]))
return
unpacked_gv
def
pack_small_tensors
(
tower_grads
,
max_bytes
=
0
,
max_group
=
0
):
"""Concatenate small gradient tensors together for reduction.
Args:
tower_grads: List of lists of (gradient, variable) tuples.
max_bytes: Int giving max number of bytes in a tensor that
may be considered small.
max_group: Int giving max number of small tensors that may be
concatenated into one new tensor.
Returns:
new_tower_grads, packing where new_tower_grads is identical to
tower_grads except that all feasible small_tensors have been removed
from their places and concatenated into larger tensors that are
now in the front of the list for each tower, and packing contains
the data necessary to restore the tower_grads structure.
Look through the first tower for gradients of the same type (float),
and small size, that are all sequential. For each such group,
replace by a new tensor that is a flattened concatenation. Note
that the corresponding variable will be absent, which doesn't matter
because it isn't used during all-reduce.
Requires:
Every gv_list in towers must have isomorphic structure including identical
tensor sizes and types.
"""
small_indices
=
[]
large_indices
=
[]
for
idx
,
(
g
,
_
)
in
enumerate
(
tower_grads
[
0
]):
if
g
.
dtype
==
tf
.
float32
and
(
4
*
g
.
shape
.
num_elements
())
<=
max_bytes
:
small_indices
.
append
(
idx
)
else
:
large_indices
.
append
(
idx
)
small_ranges
,
small_singles
=
extract_ranges
(
small_indices
,
range_size_limit
=
max_group
)
large_indices
=
sorted
(
large_indices
+
small_singles
)
num_gv
=
len
(
tower_grads
[
0
])
packing
=
{}
if
small_ranges
:
new_tower_grads
=
[]
for
dev_idx
,
gv_list
in
enumerate
(
tower_grads
):
assert
len
(
gv_list
)
==
num_gv
new_gv_list
=
[]
for
r
in
small_ranges
:
key
=
'%d:%d'
%
(
dev_idx
,
len
(
new_gv_list
))
new_gv_list
.
append
((
pack_range
(
key
,
packing
,
gv_list
,
r
),
'packing_var_placeholder'
))
for
i
in
large_indices
:
new_gv_list
.
append
(
gv_list
[
i
])
new_tower_grads
.
append
(
new_gv_list
)
return
new_tower_grads
,
packing
else
:
return
tower_grads
,
None
def
unpack_small_tensors
(
tower_grads
,
packing
):
"""Undo the structure alterations to tower_grads done by pack_small_tensors.
Args:
tower_grads: List of List of (grad, var) tuples.
packing: A dict generated by pack_small_tensors describing the changes
it made to tower_grads.
Returns:
new_tower_grads: identical to tower_grads except that concatentations
of small tensors have been split apart and returned to their original
positions, paired with their original variables.
"""
if
not
packing
:
return
tower_grads
new_tower_grads
=
[]
num_devices
=
len
(
tower_grads
)
num_packed
=
len
(
packing
.
keys
())
//
num_devices
for
dev_idx
,
gv_list
in
enumerate
(
tower_grads
):
new_gv_list
=
gv_list
[
num_packed
:]
for
i
in
xrange
(
0
,
num_packed
):
k
=
'%d:%d'
%
(
dev_idx
,
i
)
gpt
=
packing
[
k
]
gv
=
unpack_grad_tuple
(
gv_list
[
i
],
gpt
)
for
gi
,
idx
in
enumerate
(
gpt
.
indices
):
assert
idx
==
gpt
.
indices
[
gi
]
new_gv_list
.
insert
(
idx
,
gv
[
gi
])
new_tower_grads
.
append
(
new_gv_list
)
return
new_tower_grads
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/allreduce_test.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_cnn_benchmark.allreduce."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
as
pycoll
import
numpy
as
np
import
tensorflow.compat.v1
as
tf
from
tensorflow.python.framework
import
ops
from
tensorflow.python.framework
import
test_util
from
tensorflow.python.ops
import
variables
import
allreduce
class
AllReduceTest
(
tf
.
test
.
TestCase
):
def
testGroupKey
(
self
):
d0
=
[
'/job:worker/replica:0/task:0/device:GPU:1'
,
'/job:worker/replica:0/task:0/device:GPU:0'
,
'/job:worker/replica:0/task:0/device:GPU:3'
,]
d1
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:0'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,]
d2
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,
'/job:worker/replica:0/task:1/device:GPU:0'
,]
d3
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,
'/job:worker/replica:0/task:1/device:GPU:2'
,]
d4
=
[
'/job:worker/task:0/device:GPU:1'
,
'/job:worker/task:0/device:GPU:2'
,
'/job:worker/task:0/device:GPU:3'
,]
d5
=
[
'/job:worker/task:0/device:CPU:1'
,
'/job:worker/task:0/device:CPU:2'
]
d6
=
[
'/job:worker/task:0/device:CPU:2'
,
'/job:worker/task:0/device:CPU:1'
]
g0
=
allreduce
.
collective_group_key
(
d0
)
g1
=
allreduce
.
collective_group_key
(
d1
)
g2
=
allreduce
.
collective_group_key
(
d2
)
g3
=
allreduce
.
collective_group_key
(
d3
)
g4
=
allreduce
.
collective_group_key
(
d4
)
g5
=
allreduce
.
collective_group_key
(
d5
)
g6
=
allreduce
.
collective_group_key
(
d6
)
self
.
assertEqual
(
g0
,
g1
)
self
.
assertEqual
(
g0
,
g2
)
self
.
assertTrue
(
g0
!=
g3
)
self
.
assertEqual
(
g3
,
g4
)
self
.
assertEqual
(
g5
,
g6
)
self
.
assertTrue
(
g4
!=
g5
)
def
testExtractRanges
(
self
):
x
=
[]
expected_ranges
=
[]
expected_singles
=
[]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
4
,
6
,
7
,
8
,
9
]
expected_ranges
=
[[
3
,
4
],
[
6
,
9
]]
expected_singles
=
[
1
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
2
,
3
,
4
,
6
,
7
,
8
,
9
]
expected_ranges
=
[[
1
,
4
],
[
6
,
9
]]
expected_singles
=
[]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
4
,
6
,
7
,
9
]
expected_ranges
=
[[
3
,
4
],
[
6
,
7
]]
expected_singles
=
[
1
,
9
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
6
,
9
]
expected_ranges
=
[]
expected_singles
=
[
1
,
3
,
6
,
9
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
def
testPackRange
(
self
):
packing
=
{}
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
gv
=
[(
t0
,
'v0'
),
(
t1
,
'v1'
)]
new_t
=
allreduce
.
pack_range
(
'0:0'
,
packing
,
gv
,
[
0
,
1
])
self
.
assertEqual
(
1
,
new_t
.
shape
.
ndims
)
self
.
assertEqual
(
8
,
new_t
.
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v0'
,
'v1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])])
})
t2
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
gv
=
[(
t0
,
'v0'
),
(
t1
,
'v1'
),
(
t2
,
'v2'
),
(
t3
,
'v3'
)]
packing
=
{}
new_t
=
allreduce
.
pack_range
(
'1:0'
,
packing
,
gv
,
[
0
,
3
])
self
.
assertEqual
(
1
,
new_t
.
shape
.
ndims
)
self
.
assertEqual
(
26
,
new_t
.
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v0'
,
'v1'
,
'v2'
,
'v3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
]),
tf
.
TensorShape
([
3
,
3
])
])
})
def
testUnpackGradTuple
(
self
):
packing
=
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v0'
,
'v1'
,
'v2'
,
'v3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
]),
tf
.
TensorShape
([
3
,
3
])
])
}
tc
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
],
dtype
=
tf
.
float32
)
packed_gv
=
[
tc
,
'packing_var_placeholder'
]
gv
=
allreduce
.
unpack_grad_tuple
(
packed_gv
,
packing
[
'0:0'
])
self
.
assertEqual
(
4
,
len
(
gv
))
self
.
assertEqual
(
'v0'
,
gv
[
0
][
1
])
self
.
assertEqual
(
'v1'
,
gv
[
1
][
1
])
self
.
assertEqual
(
'v2'
,
gv
[
2
][
1
])
self
.
assertEqual
(
'v3'
,
gv
[
3
][
1
])
self
.
assertEqual
(
1
,
gv
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
gv
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
gv
[
1
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
gv
[
1
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
2
,
gv
[
2
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
gv
[
2
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
gv
[
2
][
0
].
shape
.
dims
[
1
])
def
testPackSmallTensors
(
self
):
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
t2
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
tower_grads
=
[]
for
d
in
range
(
0
,
3
):
gv
=
[(
t0
,
'v_%d_0'
%
d
),
(
t1
,
'v_%d_1'
%
d
),
(
t2
,
'v_%d_2'
%
d
),
(
t3
,
'v_%d_3'
%
d
)]
tower_grads
.
append
(
gv
)
# 1) Set the size limit so small that nothing gets concatenated.
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
12
,
max_group
=
10
)
self
.
assertEqual
(
tower_grads
,
new_tower_grads
)
self
.
assertTrue
(
packing
is
None
)
# 2) Set the size limit so only the first two tensors get concatenated
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
16
,
# 16 bytes == 4 elements
max_group
=
10
)
self
.
assertEqual
(
3
,
len
(
new_tower_grads
))
self
.
assertEqual
(
4
,
len
(
tower_grads
[
0
]))
first_tower
=
new_tower_grads
[
0
]
self
.
assertEqual
(
3
,
len
(
first_tower
))
self
.
assertEqual
(
1
,
first_tower
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
8
,
first_tower
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_0_0'
,
'v_0_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_1_0'
,
'v_1_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'2:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_2_0'
,
'v_2_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])])})
# 3) Set the size limit so all tensors get concatenated
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
256
,
# bytes = 64 elements
max_group
=
10
)
self
.
assertEqual
(
3
,
len
(
new_tower_grads
))
self
.
assertEqual
(
4
,
len
(
tower_grads
[
0
]))
self
.
assertEqual
(
1
,
len
(
new_tower_grads
[
0
]))
first_tower
=
new_tower_grads
[
0
]
self
.
assertEqual
(
1
,
first_tower
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
26
,
first_tower
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_0_0'
,
'v_0_1'
,
'v_0_2'
,
'v_0_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_1_0'
,
'v_1_1'
,
'v_1_2'
,
'v_1_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'2:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_2_0'
,
'v_2_1'
,
'v_2_2'
,
'v_2_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])])})
def
testUnpackSmallTensors
(
self
):
packing
=
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_0_0'
,
'v_0_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'0:1'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
3
,
5
),
vars
=
[
'v_0_3'
,
'v_0_4'
],
shapes
=
[
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_1_0'
,
'v_1_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'1:1'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
3
,
5
),
vars
=
[
'v_1_3'
,
'v_1_4'
],
shapes
=
[
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])])}
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
17
,
17
],
dtype
=
tf
.
float32
)
t2
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([
0
],
dtype
=
tf
.
float32
)
tower_grads
=
[]
for
d
in
range
(
0
,
2
):
one_tower
=
[(
t0
,
'packing_var_placeholder'
),
(
t2
,
'packing_var_placeholder'
),
(
t1
,
'v_%d_2'
%
d
),
(
t3
,
'v_%d_5'
%
d
)]
tower_grads
.
append
(
one_tower
)
new_tower_grads
=
allreduce
.
unpack_small_tensors
(
tower_grads
,
packing
)
self
.
assertEqual
(
2
,
len
(
new_tower_grads
))
for
d
,
tg
in
enumerate
(
new_tower_grads
):
self
.
assertEqual
(
6
,
len
(
tg
))
self
.
assertEqual
(
'v_%d_0'
%
d
,
tg
[
0
][
1
])
self
.
assertEqual
(
'v_%d_1'
%
d
,
tg
[
1
][
1
])
self
.
assertEqual
(
'v_%d_2'
%
d
,
tg
[
2
][
1
])
self
.
assertEqual
(
'v_%d_3'
%
d
,
tg
[
3
][
1
])
self
.
assertEqual
(
'v_%d_4'
%
d
,
tg
[
4
][
1
])
self
.
assertEqual
(
'v_%d_5'
%
d
,
tg
[
5
][
1
])
self
.
assertEqual
(
1
,
tg
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
tg
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
tg
[
1
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
tg
[
1
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
tg
[
2
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
2
,
tg
[
2
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
2
,
tg
[
3
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
tg
[
3
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
tg
[
3
][
0
].
shape
.
dims
[
1
])
self
.
assertEqual
(
2
,
tg
[
4
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
tg
[
4
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
tg
[
4
][
0
].
shape
.
dims
[
1
])
self
.
assertEqual
(
1
,
tg
[
5
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
1
,
tg
[
5
][
0
].
shape
.
dims
[
0
])
class
DynamicPackingTest
(
test_util
.
TensorFlowTestCase
):
"""Packing/Unpacking tests that require executing a TensorFlow session."""
def
_init_tensors
(
self
,
num_towers
,
tensor_shapes
):
"""Construct a collection of tensors across multiple devices."""
num_tensors
=
len
(
tensor_shapes
)
consts
=
[]
tensors
=
[]
vrbls
=
[]
tower_grads
=
[]
tf
.
Variable
([
-
1
],
dtype
=
tf
.
int32
,
name
=
'packing_var_placeholder'
)
for
dev_idx
in
range
(
0
,
num_towers
):
devname
=
'/job:localhost/device:GPU:%d'
%
dev_idx
consts
.
append
([])
tensors
.
append
([])
vrbls
.
append
([])
with
tf
.
device
(
devname
):
base_value
=
0
gv_tuples
=
[]
for
t_idx
in
range
(
0
,
num_tensors
):
shape
=
tensor_shapes
[
t_idx
]
num_elts
=
0
for
d
in
shape
:
num_elts
=
(
num_elts
or
1
)
*
d
c
=
np
.
fromiter
(
range
(
base_value
,
base_value
+
num_elts
),
dtype
=
np
.
float32
).
reshape
(
shape
)
base_value
+=
num_elts
consts
[
dev_idx
].
append
(
c
)
tensors
[
dev_idx
].
append
(
tf
.
constant
(
c
))
vrbls
[
dev_idx
].
append
(
tf
.
Variable
(
c
,
name
=
'v_d%d_t%d'
%
(
dev_idx
,
t_idx
)))
gv_tuples
.
append
((
tensors
[
dev_idx
][
-
1
],
vrbls
[
dev_idx
][
-
1
]))
tower_grads
.
append
(
gv_tuples
)
return
tower_grads
,
consts
,
tensors
,
vrbls
_test_tuple
=
pycoll
.
namedtuple
(
'_test_tuple'
,
'num_devices, in_shapes out_shapes out_i'
)
def
_do_pack_unpack_test
(
self
,
tt
):
"""Do a single pack-unpack test.
Args:
tt: A _test_tuple defining the parameters of the test to do.
This test executes a graph that performs a pack of tower_grads
followed by an unpack and verifies that the shapes and values
of gradient tensors are unchanged, along with paired variables.
"""
with
ops
.
Graph
().
as_default
():
tower_grads
,
consts
,
_
,
vrbls
=
self
.
_init_tensors
(
tt
.
num_devices
,
tt
.
in_shapes
)
packed_tg
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
40
,
max_group
=
10
)
unpacked_tg
=
allreduce
.
unpack_small_tensors
(
packed_tg
,
packing
)
with
self
.
test_session
()
as
sess
:
sess
.
run
(
variables
.
global_variables_initializer
())
packed
=
sess
.
run
(
packed_tg
)
for
d
in
range
(
0
,
tt
.
num_devices
):
for
t
in
range
(
0
,
len
(
tt
.
out_shapes
)):
num_elts
=
0
for
dim
in
tt
.
out_shapes
[
t
]:
num_elts
=
(
num_elts
or
1
)
*
dim
self
.
assertTrue
(
np
.
array_equal
(
np
.
array
(
range
(
tt
.
out_i
[
t
],
tt
.
out_i
[
t
]
+
num_elts
),
dtype
=
np
.
float32
).
reshape
(
tt
.
out_shapes
[
t
]),
packed
[
d
][
t
][
0
]))
unpacked
=
sess
.
run
(
unpacked_tg
)
for
d
in
range
(
0
,
tt
.
num_devices
):
for
t
in
range
(
0
,
len
(
tt
.
in_shapes
)):
self
.
assertTrue
(
np
.
array_equal
(
consts
[
d
][
t
],
unpacked
[
d
][
t
][
0
]))
self
.
assertEqual
(
vrbls
[
d
][
t
],
unpacked_tg
[
d
][
t
][
1
])
def
testPackUnpack0
(
self
):
self
.
_do_pack_unpack_test
(
self
.
_test_tuple
(
num_devices
=
3
,
in_shapes
=
[[
8
],
[
3
,
3
],
[
12
],
[
5
,
5
,
5
]],
out_shapes
=
[[
17
],
[
12
],
[
5
,
5
,
5
]],
out_i
=
[
0
,
17
,
29
]))
def
testPackUnpack1
(
self
):
self
.
_do_pack_unpack_test
(
self
.
_test_tuple
(
num_devices
=
4
,
in_shapes
=
[[
5
,
5
,
5
],
[
2
,
3
],
[
5
]],
out_shapes
=
[[
11
],
[
5
,
5
,
5
]],
out_i
=
[
125
,
0
]))
def
testPackUnpack2
(
self
):
self
.
_do_pack_unpack_test
(
self
.
_test_tuple
(
num_devices
=
2
,
in_shapes
=
[[
5
,
5
,
5
],
[
2
,
3
],
[
1
,
5
],
[
7
],
[
100
]],
out_shapes
=
[[
18
],
[
5
,
5
,
5
],
[
100
]],
out_i
=
[
125
,
0
,
143
]))
def
_do_all_reduce_pack_test
(
self
,
tt
):
"""Test that all-reduce results are the same with or without packing."""
with
ops
.
Graph
().
as_default
():
tower_grads
,
consts
,
_
,
_
=
self
.
_init_tensors
(
tt
.
num_devices
,
tt
.
in_shapes
)
dev_prefixes
=
[
'/job:localhost'
]
num_workers
=
1
alg
=
'xring'
shards
=
1
single_session
=
True
gpu_indices
=
range
(
0
,
tt
.
num_devices
)
assert
len
(
gpu_indices
)
==
len
(
tower_grads
)
no_pack_all_reduce
=
allreduce
.
sum_gradients_all_reduce
(
single_session
,
dev_prefixes
,
tower_grads
,
num_workers
,
alg
,
shards
,
gpu_indices
,
agg_small_grads_max_bytes
=
0
,
agg_small_grads_max_group
=
1
)
packed_tg
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
100
,
100
)
packed_all_reduce
=
allreduce
.
sum_gradients_all_reduce
(
single_session
,
dev_prefixes
,
packed_tg
,
num_workers
,
alg
,
shards
,
gpu_indices
,
agg_small_grads_max_bytes
=
0
,
agg_small_grads_max_group
=
1
)
unpacked_tg
=
allreduce
.
unpack_small_tensors
(
packed_all_reduce
,
packing
)
with
self
.
test_session
()
as
sess
:
sess
.
run
(
variables
.
global_variables_initializer
())
no_pack_values
=
sess
.
run
(
no_pack_all_reduce
)
pack_unpack_values
=
sess
.
run
(
unpacked_tg
)
for
d
in
range
(
1
,
tt
.
num_devices
):
for
t
in
range
(
0
,
len
(
tt
.
in_shapes
)):
self
.
assertTrue
(
np
.
allclose
(
no_pack_values
[
d
][
t
][
0
],
tt
.
num_devices
*
consts
[
0
][
t
]))
self
.
assertTrue
(
np
.
array_equal
(
no_pack_values
[
d
][
t
][
0
],
pack_unpack_values
[
d
][
t
][
0
]))
def
testAllReducePacked0
(
self
):
self
.
_do_all_reduce_pack_test
(
self
.
_test_tuple
(
num_devices
=
3
,
in_shapes
=
[[
8
],
[
3
,
3
],
[
12
],
[
5
,
5
,
5
]],
out_shapes
=
[[
17
],
[
12
],
[
5
,
5
,
5
]],
out_i
=
[
0
,
17
,
29
]))
def
testAllReducePacked1
(
self
):
self
.
_do_all_reduce_pack_test
(
self
.
_test_tuple
(
num_devices
=
2
,
in_shapes
=
[[
8
],
[
3
,
3
],
[
12
],
[
5
,
5
,
5
],
[
3
],
[
4
]],
out_shapes
=
[[
17
],
[
7
],
[
12
],
[
5
,
5
,
5
]],
out_i
=
[
0
,
17
,
29
,
154
,
157
]))
if
__name__
==
'__main__'
:
tf
.
disable_v2_behavior
()
tf
.
test
.
main
()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/batch_allreduce.py
deleted
100644 → 0
View file @
4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes and functions for doing a single-machine batch all-reduce.
An all-reduce is taking the reduction (typically a sum) of a list of tensors,
each on a different device. The result must end up back on each device, which is
where the word "all" comes from. In summary, each device starts with a single
tensor, and ends up with the reduction of all tensors.
A batch all-reduce is doing several independent all-reduces. When doing a batch
all-reduce, care is taken to evenly distribute the reduction computations
across devices and inter-device tensor transfers across device links.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# TODO(reedwm): Support distributed all-reduces in this file.
# TODO(reedwm): Merge this code with allreduce.py, which contains some batch
# all-reduce code that this file calls. allreduce.py also supports distributed
# batch-reduce while this file only supports single-machine all-reduce.

import abc

import six
import tensorflow.compat.v1 as tf

from tensorflow.python.ops import data_flow_ops
import allreduce
import constants
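# --- Illustration (not part of the original file) ---
# Minimal sketch of the all-reduce semantics described in the module
# docstring above, using plain Python lists in place of per-device tensors.
# This is a toy model for illustration only, not the implementation below.
def _toy_batch_all_reduce(all_device_tensors):
  """all_device_tensors[i][j] is tensor j on device i; returns the same
  structure where every entry j is replaced by the sum over devices."""
  num_tensors = len(all_device_tensors[0])
  sums = [sum(dev[j] for dev in all_device_tensors)
          for j in range(num_tensors)]
  # Every device ends up with an identical copy of each reduced tensor.
  return [list(sums) for _ in all_device_tensors]

# Example: _toy_batch_all_reduce([[1.0, 2.0], [3.0, 4.0]])
#          == [[4.0, 6.0], [4.0, 6.0]]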
def _all_reduce_using_copy(tensors_across_devices, use_mean):
  """Does an all-reduce of a list of tensors by copying to the current device.

  The tensors are copied to the current device and then reduced.

  Args:
    tensors_across_devices: A list of tensors, each on a different device.
    use_mean: Whether to take the mean of the tensors instead of a sum.
  Returns:
    A reduced tensor on the current device.
  """
  reduced_tensor = tf.add_n(tensors_across_devices)
  if use_mean:
    reduced_tensor *= 1 / len(tensors_across_devices)
  return reduced_tensor
@six.add_metaclass(abc.ABCMeta)
class BatchAllReduceAlgorithm(object):
  """Represents an algorithm for performing a batch all-reduce operation."""

  def batch_all_reduce(self,
                       all_device_tensors,
                       num_splits,
                       compact_tensors,
                       defer_tensors,
                       xla_compile=False):
    """Performs a batch all-reduce.

    The reduction done is a sum.

    `all_device_tensors` is a list of list of tensors that will be batch
    all-reduced. All tensors within a single inner list must be on the same
    device. The nth element in each list, for any n, will be reduced together.
    The return value is in the same form as `all_device_tensors`, except that
    each tensor is reduced.

    For example, if `all_device_tensors` is:
    [[ A,  B  ],     # A and B are on GPU 0
     [ C,  D  ]]     # C and D are on GPU 1

    Then the return value will be:
    [[ A+C,  B+D ],  # These two tensors are on GPU 0
     [ A+C,  B+D ]]  # These two tensors are on GPU 1

    Arguments:
      all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
        is a tensor where `i` is the device index and `j` is the tensor index.
      num_splits: If not None, tensors will be concatenated and split into
        this many pieces during the all-reduce, then split back into their
        original shapes afterwards. Has no impact on correctness and can
        improve performance. Requires all tensors to be the same type.
      compact_tensors: If True, tensors are casted to fp16 before being all-
        reduced. Improves performance, but hurts numerical stability.
      defer_tensors: If True, every time the return value
        `reduced_all_device_tensors` is evaluated, the result will be the
        reduced tensors values of `all_device_tensors` from the previous
        session run instead of the current session run, or zero on the first
        session run. This can improve performance. When training neural
        networks, deferring gradients often does not harm training, so this
        can be used to improve performance.
      xla_compile: If True, use XLA to compile gradients packing and unpacking
        ops.

    Returns:
      reduced_all_device_tensors: A list in the same form as
        `all_device_tensors`, except each tensor has been reduced.
      warmup_ops: A list of ops needed to be run once before the all-reduce
        can occur.
    """

    # Before all-reducing tensors, we do several preprocessing functions that
    # can speed up the all-reduce. We undo these functions after all-reducing
    # the tensors.

    # all_device_packed_tensors is a 2-d list of tensors indexed by
    # [device_id][tensor_id], holding packed tensors from all devices involved
    # in all-reduce.
    all_device_packed_tensors = []
    # all_device_warmup_ops is a 2-d list of ops indexed by
    # [device_id][tensor_id], holding warmup_ops that need to be run once
    # before all-reduce can occur.
    all_device_warmup_ops = []
    # all_device_put_ops is a 2-d list of ops indexed by
    # [device_id][tensor_id], holding put ops for deferred tensors. They will
    # be called in each all-reduce step automatically due to control
    # dependency.
    all_device_put_ops = []
    # packers is a list of _TensorPacker, one for each device involved in
    # all-reduce.
    packers = [
        _TensorPacker(num_splits, compact_tensors)
        for _ in all_device_tensors
    ]

    for packer, device_tensors in zip(packers, all_device_tensors):
      def pack_single_device_tensors(packer=packer,
                                     device_tensors=device_tensors):
        """Pack gradient tensors of a device."""
        packed_tensors = packer.maybe_concat_tensors(device_tensors)
        packed_tensors = packer.maybe_compact_tensors(packed_tensors)
        # When xla_compile=False, defer tensors after concat for better
        # performance.
        if defer_tensors and not xla_compile:
          packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
              packed_tensors)
          all_device_put_ops.append(put_ops)
          all_device_warmup_ops.append(warmup_ops)
        packed_tensors = packer.maybe_split_tensors(packed_tensors)
        return packed_tensors

      with tf.device(device_tensors[0].device):
        if xla_compile:
          packed_tensors = tf.xla.experimental.compile(
              pack_single_device_tensors)
          # When xla_compile=True, intermediate tensors in packing process are
          # not materialized. Thus, we defer tensors after packing process is
          # completed instead of in the middle of it.
          if defer_tensors:
            packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
                packed_tensors)
            all_device_put_ops.append(put_ops)
            all_device_warmup_ops.append(warmup_ops)
        else:
          packed_tensors = pack_single_device_tensors()
        all_device_packed_tensors.append(packed_tensors)

    # Perform all-reduce on packed tensors.
    all_device_tensors = self._do_batch_all_reduce(all_device_packed_tensors)

    all_device_unpacked_tensors = []
    for packer, device_tensors in zip(packers, all_device_tensors):
      def unpack_single_device_tensors(packer=packer,
                                       device_tensors=device_tensors):
        """Unpack gradient tensors of a device."""
        unpacked_tensors = packer.undo_maybe_split_tensors(device_tensors)
        unpacked_tensors = packer.undo_maybe_compact_tensors(unpacked_tensors)
        unpacked_tensors = packer.undo_maybe_concat_tensors(unpacked_tensors)
        return unpacked_tensors

      with tf.device(device_tensors[0].device):
        if xla_compile:
          unpacked_device_tensor = tf.xla.experimental.compile(
              unpack_single_device_tensors)
        else:
          unpacked_device_tensor = unpack_single_device_tensors()
        all_device_unpacked_tensors.append(unpacked_device_tensor)

    # Note: There is no undo operation for deferring tensors. But we do need
    # to call _add_put_op_control_deps at the end if we deferred the tensors.
    if defer_tensors:
      all_device_unpacked_tensors = _add_put_op_control_deps(
          all_device_unpacked_tensors, num_splits, all_device_put_ops)

    return all_device_unpacked_tensors, all_device_warmup_ops

  @abc.abstractmethod
  def _do_batch_all_reduce(self, all_device_tensors):
    """Performs a batch all-reduce.

    Unlike `self.batch_all_reduce`, this does not do any preprocessing of the
    tensors.

    Args:
      all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
        is a tensor where `i` is the device index and `j` is the tensor index.
    Returns:
      reduced_all_device_tensors: A list in the same form as
        `all_device_tensors`, except each tensor has been reduced.
    """
    pass
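# --- Illustration (not part of the original file) ---
# Concrete instance of the example in the batch_all_reduce() docstring above,
# stated with the toy reducer sketched after the imports (illustration only;
# the real class dispatches to a concrete _do_batch_all_reduce subclass):
#   A, B on "GPU 0" and C, D on "GPU 1":
#   _toy_batch_all_reduce([[1.0, 10.0], [2.0, 20.0]])
#   == [[3.0, 30.0], [3.0, 30.0]]   # i.e. [[A+C, B+D], [A+C, B+D]]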
class CopyToDeviceAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that copies tensors to be reduced to a specific device."""

  def __init__(self, devices_to_reduce_on, use_mean=False):
    self._devices = devices_to_reduce_on
    self._use_mean = use_mean

  def _do_batch_all_reduce(self, all_device_tensors):
    reduced_tensors = []
    for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
      with tf.device(self._devices[i % len(self._devices)]):
        reduced_tensor = _all_reduce_using_copy(tensors_across_devices,
                                                self._use_mean)
        reduced_tensors.append(reduced_tensor)
    # The tensors will be brought back to each device once they are used.
    return [reduced_tensors] * len(all_device_tensors)
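# --- Illustration (not part of the original file) ---
# CopyToDeviceAlgorithm spreads the per-tensor reductions round-robin over
# `devices_to_reduce_on`. A hypothetical construction (device names assumed):
#   algo = CopyToDeviceAlgorithm(['/gpu:0', '/gpu:1'])
# tensor 0 is reduced on /gpu:0, tensor 1 on /gpu:1, tensor 2 on /gpu:0, ...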
class HierarchicalCopyAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that uses hierarchical copies.

  This is only optimized for eight devices connected in NetworkTopology.DGX1
  or NetworkTopology.GCP_V100 topology.
  """

  def __init__(self, network_topology):
    """Initializer for HierarchicalCopyAlgorithm.

    Args:
      network_topology: An instance of Enum class constants.NetworkTopology.
    """
    self._network_topology = network_topology

  def _do_batch_all_reduce(self, all_device_tensors):
    avail_devices = [device_tensors[0].device
                     for device_tensors in all_device_tensors]
    reduced_tensors = []
    num_devices = len(avail_devices)
    group_size = num_devices // 2
    for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
      group_0_main_device, group_1_main_device = self.__get_main_devices(
          i, num_devices)
      if group_0_main_device < group_size:
        group_0_begin = 0
        group_1_begin = group_size
      else:
        group_0_begin = group_size
        group_1_begin = 0

      # Reduce the first group.
      group_0_tensors = tensors_across_devices[group_0_begin:
                                               group_0_begin + group_size]
      with tf.device(avail_devices[group_0_main_device]):
        group_0_reduced_tensor = _all_reduce_using_copy(group_0_tensors, False)

      # Reduce the second group.
      group_1_tensors = tensors_across_devices[group_1_begin:
                                               group_1_begin + group_size]
      with tf.device(avail_devices[group_1_main_device]):
        group_1_reduced_tensor = _all_reduce_using_copy(group_1_tensors, False)

      # Reduce between the groups.
      with tf.device(avail_devices[group_0_main_device]):
        total_reduced_tensor = _all_reduce_using_copy(
            [group_0_reduced_tensor, group_1_reduced_tensor], False)

      # Broadcast the result back into the root of each group.
      with tf.device(avail_devices[group_0_main_device]):
        group_0_reduced_tensor_bcast = tf.identity(total_reduced_tensor)
      with tf.device(avail_devices[group_1_main_device]):
        group_1_reduced_tensor_bcast = tf.identity(total_reduced_tensor)

      reduced_tensors_bcast = []
      for j in range(len(tensors_across_devices)):
        with tf.device(avail_devices[j]):
          # Broadcast the result back to each member in the group from the
          # root.
          if (group_0_main_device < group_size) == (j < group_size):
            src_device_tensor = group_0_reduced_tensor_bcast
          else:
            src_device_tensor = group_1_reduced_tensor_bcast
          reduced_tensors_bcast.append(tf.identity(src_device_tensor))

      reduced_tensors.append(reduced_tensors_bcast)

    reduced_tensors = list(zip(*reduced_tensors))
    return reduced_tensors

  def __get_main_devices(self, tensor_index, num_devices):
    """Returns the pair of main devices to use for initial reduction.

    Args:
      tensor_index: Index of the current tensor in the list of tensors to
        copy.
      num_devices: Total number of devices.
    Returns:
      A tuple containing pair of main device indices for the initial
      reduction. Then, the first element of the tuple should be used for the
      final reduction.
    Raises:
      ValueError: Invalid input arguments.
    """
    if self._network_topology == constants.NetworkTopology.DGX1:
      return tensor_index % num_devices, (tensor_index +
                                          (num_devices // 2)) % num_devices
    elif self._network_topology == constants.NetworkTopology.GCP_V100:
      if num_devices != 8:
        raise ValueError('HierarchicalCopy only supports eight devices in %s.'
                         % self._network_topology)
      # TODO(hinsu): Generalize main device indices to handle any other
      # isomorphic connection graph that connects two cliques using
      # connections other than 0-5 and 2-7.
      main_device_pairs = [(0, 5), (2, 7), (5, 0), (7, 2)]
      return main_device_pairs[tensor_index % len(main_device_pairs)]
    else:
      # TODO(reedwm): make this logic more general for arbitrary topology.
      raise ValueError(
          'HierarchicalCopy is not supported for %s network topology.' %
          self._network_topology)
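# --- Illustration (not part of the original file) ---
# For the DGX1 branch above, the two group roots rotate with the tensor index
# so no single GPU becomes a bottleneck. With num_devices=8 the pair is
# (tensor_index % 8, (tensor_index + 4) % 8), e.g.:
#   tensor_index 0 -> roots (0, 4)
#   tensor_index 1 -> roots (1, 5)
#   tensor_index 7 -> roots (7, 3)
#   tensor_index 8 -> roots (0, 4) again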
class AllReduceSpecAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that uses an all reduce spec."""

  def __init__(self, all_reduce_spec, gpu_indices, agg_small_grads_max_bytes,
               agg_small_grads_max_group):
    spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
    if len(spec) != 1:
      raise ValueError(
          'Replicated mode does not support hybrid all-reduce strategies')
    self._all_reduce_spec = spec[0]
    self._gpu_indices = gpu_indices
    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
    self._agg_small_grads_max_group = agg_small_grads_max_group

  def _do_batch_all_reduce(self, all_device_tensors):
    # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
    # gradient aggregation code, since gradient aggregation is doing an all
    # reduce. Currently, we do gradient repacking in two different places.
    # TODO(reedwm): Change the allreduce code to reduce tensors instead of
    # tower_grads.
    tower_grads = [[(t, None) for t in device_tensors]
                   for device_tensors in all_device_tensors]
    aggregated_device_grads = allreduce.sum_gradients_all_reduce(
        False,  # single_session
        ['/job:localhost'],
        tower_grads,
        1,
        self._all_reduce_spec.alg,
        self._all_reduce_spec.shards,
        self._gpu_indices,
        agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
        agg_small_grads_max_group=self._agg_small_grads_max_group)
    return [[t for t, _ in grad_vars]
            for grad_vars in aggregated_device_grads]
def algorithm_from_params(params):
  """Returns a BatchAllReduceAlgorithm from a Params tuple."""
  if params.all_reduce_spec:
    if params.gpu_indices:
      gpu_indices = [int(x) for x in params.gpu_indices.split(',')]
    else:
      gpu_indices = [x for x in range(params.num_gpus)]
    return AllReduceSpecAlgorithm(params.all_reduce_spec, gpu_indices,
                                  params.agg_small_grads_max_bytes,
                                  params.agg_small_grads_max_group)
  elif params.hierarchical_copy:
    return HierarchicalCopyAlgorithm(params.network_topology)
  else:
    if params.local_parameter_device == 'gpu':
      devices_to_reduce_on = ['/gpu:%d' % i for i in range(params.num_gpus)]
    else:
      devices_to_reduce_on = ['/cpu:0']
    return CopyToDeviceAlgorithm(devices_to_reduce_on)
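# --- Illustration (not part of the original file) ---
# Sketch of the selection logic above for a hypothetical params namespace
# (field names taken from the function, values assumed for illustration):
#   all_reduce_spec='nccl', gpu_indices='0,1'      -> AllReduceSpecAlgorithm
#   all_reduce_spec=None, hierarchical_copy=True   -> HierarchicalCopyAlgorithm
#   all_reduce_spec=None, hierarchical_copy=False,
#   local_parameter_device='gpu', num_gpus=2
#       -> CopyToDeviceAlgorithm(['/gpu:0', '/gpu:1'])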
def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True):
  """Applies a function to each tensor in `all_device_tensors`.

  A new list of lists of tensors is returned, where every tensor in
  `all_device_tensors` has had `apply_func` called on it. `all_device_tensors`
  is not modified.

  Args:
    all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
      is a tensor where `i` is the device index and `j` is the tensor index.
    apply_func: A function taking in three arguments: tensor, device_index,
      tensor_index, and returning a modified tensor.
      `tensor` is `all_device_tensors[device_index][tensor_index]`.
    colocate: If True, apply_func will be run under a context manager
      colocated with its input tensor.

  Returns:
    A list in the same form as `all_device_tensors`, except each tensor has
    had `apply_func` called on it.
  """
  new_all_device_tensors = []
  for device_index, device_tensors in enumerate(all_device_tensors):
    new_device_tensors = []
    for tensor_index, t in enumerate(device_tensors):
      if colocate:
        with tf.colocate_with(t):
          new_t = apply_func(t, device_index, tensor_index)
      else:
        new_t = apply_func(t, device_index, tensor_index)
      new_device_tensors.append(new_t)
    new_all_device_tensors.append(new_device_tensors)
  return new_all_device_tensors
def _defer_tensor(tensor):
  """Defers the retrieval of a tensor.

  The tensor is put into a StagingArea, and the return value is the
  retrieval of the tensor from the StagingArea. The effect is that the
  tensor returned from this function is the tensor that was put in the
  StagingArea for the previous Session.run() call.

  Args:
    tensor: The tensor to defer for one step.

  Returns:
    deferred_tensor: The tensor deferred for one step.
    put_op: An op to put `tensor` in the StagingArea. Must be run every step
      that `deferred_tensor` is run.
    warmup_op: A warmup op that should be called before the first step. Puts
      a zero tensor into the StagingArea.
  """
  tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape])
  put_op = tensor_stage.put([tensor])
  warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)])

  # Fetch the next tensor to use.
  (tensor,) = tensor_stage.get()
  return tensor, put_op, warmup_op
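# --- Illustration (not part of the original file) ---
# Timeline of the staging trick above (values are illustrative):
#   step 0: warmup_op puts zeros; get() returns zeros; put_op stages x_0
#   step 1: get() returns x_0 (the previous step's value); put_op stages x_1
#   step n: get() returns x_{n-1}
# Consumers of the deferred tensor always see the value produced one session
# run earlier, which is what makes gradient deferral possible.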
def defer_single_device_tensors(device_tensors):
  """Defer tensors (gradients in this case) from a single device.

  Arguments:
    device_tensors: A list of gradients tensors from a single device to defer.

  Returns:
    deferred_tensors: A list of tensors deferred for one step.
    put_ops: A list of ops that put `tensors` in the StagingAreas. Must be run
      every step that `deferred_tensors` is run.
    warmup_ops: Warmup ops that should be called before the first step. Puts
      zero tensors into the StagingArea.
  """
  put_ops = []
  warmup_ops = []
  deferred_tensors = []

  for tensor in device_tensors:
    deferred_tensor, put_op, warmup_op = _defer_tensor(tensor)
    deferred_tensors.append(deferred_tensor)
    put_ops.append(put_op)
    warmup_ops.append(warmup_op)

  return deferred_tensors, put_ops, warmup_ops
def _add_put_op_control_deps(all_device_tensors, num_splits, put_ops):
  """Add control dependencies from `put_ops` to `all_device_tensors`.

  This should only be called when deferred tensors are being used.

  The control dependencies are added so that the put ops are run whenever
  `all_device_tensors` is run. That way, the caller does not have to
  explicitly run the put ops.

  Args:
    all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
      is a tensor where `i` is the device index and `j` is the tensor index.
    num_splits: The number of splits that were used for the all-reduce.
    put_ops: A list of put ops from deferring the tensors.

  Returns:
    A list in the same form as `all_device_tensors`, except each tensor has a
    control dependency on an op in `put_ops`.
  """
  def apply_func(tensor, device_index, tensor_index):
    if num_splits == 0:
      deps = [put_ops[device_index][tensor_index]]
    else:
      deps = put_ops[device_index]
      assert len(deps) == 1
    with tf.control_dependencies(deps):
      return tf.identity(tensor, name='control_dependency')
  return _apply_to_all_device_tensors(all_device_tensors, apply_func)
class _TensorPacker(object):
  """Packs and unpacks tensors into groups.

  This class first concatenates a set of tensors, then splits the
  concatenated tensor into a small number of chunks. This is useful for
  all-reducing tensors, as doing a small number of all-reduces on large
  tensors can be faster than doing a large number of all-reduces on small
  tensors.

  It also provides an option to compact tensors by casting them to fp16, for
  better all-reduce performance.

  This class maintains states of processed tensors like shapes and types. So
  each packer can only be used to pack and unpack one list of tensors. If you
  need to pack multiple lists of tensors (say from multiple devices), then you
  need multiple _TensorPacker objects, one for each device.
  """

  def __init__(self, num_splits, compact):
    """Initializes the _TensorPacker.

    Arguments:
      num_splits: The number of tensors to split the concatenated tensor into.
        The batch all-reduce will consist of `num_splits` all-reduces. If None
        or zero, tensors are not split or concatenated.
      compact: If True, tensors are casted to fp16 during packing and casted
        back to their original dtypes during unpacking.
    """
    self._num_splits = num_splits
    self._compact = compact
    self._before_compact_dtypes = []

  def maybe_concat_tensors(self, device_tensors):
    """Concatenate tensors into a single tensor."""
    if not self._num_splits:
      return device_tensors
    flat_tensors = [tf.reshape(t, [-1]) for t in device_tensors]
    self._orig_shapes = [t.shape for t in device_tensors]
    self._orig_sizes = [s.num_elements() for s in self._orig_shapes]
    # All shapes must be fully defined.
    assert None not in self._orig_sizes
    concatenated_grad = tf.concat(flat_tensors, 0)
    return [concatenated_grad]

  def maybe_split_tensors(self, concatenated_tensor):
    """Split concatenated tensor into `num_splits` pieces."""
    if not self._num_splits:
      return concatenated_tensor
    if len(concatenated_tensor) != 1:
      raise RuntimeError('tensors must be concatenated via '
                         'maybe_concat_tensors() before splitting')
    concatenated_tensor = concatenated_tensor[0]
    total_tensor_size = concatenated_tensor.shape.num_elements()
    split_size = total_tensor_size // self._num_splits
    split_size_last = total_tensor_size - split_size * (self._num_splits - 1)
    split_sizes = [split_size] * (self._num_splits - 1) + [split_size_last]
    tensor_packs = tf.split(concatenated_tensor, split_sizes)
    return tensor_packs
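  # --- Illustration (not part of the original file) ---
  # Split-size arithmetic used in maybe_split_tensors() (values illustrative):
  # for a concatenated tensor of 1000 elements and num_splits=3:
  #   split_size      = 1000 // 3 = 333
  #   split_size_last = 1000 - 333 * 2 = 334
  #   split_sizes     = [333, 333, 334]   # sums back to 1000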
  def undo_maybe_split_tensors(self, tensor_packs):
    """Undo maybe_split_tensors()."""
    if not self._num_splits:
      return tensor_packs
    return [tf.concat(tensor_packs, 0)]

  def undo_maybe_concat_tensors(self, concatenated_tensor):
    """Undo maybe_concat_tensors()."""
    if not self._num_splits:
      return concatenated_tensor
    if len(concatenated_tensor) != 1:
      raise RuntimeError(
          'undo_maybe_split_tensors() must be called before '
          'undo_maybe_concat_tensors when num_splits is greater than 1')
    concatenated_tensor = concatenated_tensor[0]
    tensors_with_sizes = tf.split(concatenated_tensor, self._orig_sizes)
    tensors_with_shapes = [
        tf.reshape(grad, shape)
        for grad, shape in zip(tensors_with_sizes, self._orig_shapes)
    ]
    return tensors_with_shapes

  def maybe_compact_tensors(self, device_tensors):
    """Cast tensors to fp16 and store their original types."""
    if not self._compact:
      return device_tensors
    if self._before_compact_dtypes:
      raise RuntimeError('maybe_compact_tensors can only be called once.')
    self._before_compact_dtypes = [t.dtype for t in device_tensors]
    compact_tensors = [tf.cast(t, tf.float16) for t in device_tensors]
    return compact_tensors

  def undo_maybe_compact_tensors(self, compact_tensors):
    """Undo maybe_compact_tensors()."""
    if not self._compact:
      return compact_tensors
    if not self._before_compact_dtypes:
      raise RuntimeError('maybe_compact_tensors() must be called before '
                         'undo_maybe_compact_tensors()')
    device_tensors = [
        tf.cast(t, dtype)
        for t, dtype in zip(compact_tensors, self._before_compact_dtypes)
    ]
    return device_tensors
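# --- Illustration (not part of the original file) ---
# Hedged usage sketch of the _TensorPacker pack/unpack round trip (TF1 graph
# mode; shapes and num_splits chosen arbitrarily for illustration):
#   packer = _TensorPacker(num_splits=2, compact=False)
#   tensors = [tf.ones([3, 3]), tf.zeros([5])]        # 9 + 5 = 14 elements
#   packed = packer.maybe_split_tensors(
#       packer.maybe_concat_tensors(tensors))          # two chunks: 7 + 7
#   restored = packer.undo_maybe_concat_tensors(
#       packer.undo_maybe_split_tensors(packed))       # original shapes back
# `restored` has the same shapes and values as `tensors`.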
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn.py
deleted 100644 → 0
View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow benchmark library.
See the README for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
from collections import namedtuple
import contextlib
import math
import multiprocessing
import os
import re
import threading
import time
import traceback

from absl import flags as absl_flags
import numpy as np

import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf

# pylint: disable=g-direct-tensorflow-import
import cnn_util
import constants
import datasets
import flags
import mlperf
import variable_mgr
import variable_mgr_util
from cnn_util import log_fn
from models import model_config
from platforms import util as platforms_util
from google.protobuf import text_format
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python import debug as tf_debug
from tensorflow.python.client import timeline
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_util_impl
from tensorflow.python.framework import importer
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import nest
_DEFAULT_NUM_BATCHES = 100

# GraphInfo encapsulates the tensors/ops that we care about after building a
# graph. We use them to benchmark the graph.
GraphInfo = namedtuple(  # pylint: disable=invalid-name
    'GraphInfo',
    [
        # Ops that produce the input batches (before preprocessing).
        'input_producer_op',
        # Ops that adds the preprocessed images to the staging areas
        'enqueue_ops',
        # Fetches of sess.run()
        'fetches',
        # Op that performs synchronization in distributed mode
        'execution_barrier',
        # The global step variable
        'global_step',
        # Group of ops that perform per-device initialization work
        'local_var_init_op_group',
        # Op to produce summaries
        'summary_op'
    ])

# InputProcessingInfo contains various sources of inputs which will be later
# fed into the model. If synthetic data is used, all three fields are None.
InputProcessingInfo = namedtuple(
    'InputProcessingInfo',
    [
        # The first two fields are non-None iff datasets prefetching is not
        # used.
        # Ops that produce the input batches.
        'input_producer_op',
        # A list of StagingArea for each device.
        'input_producer_stages',
        # Input produced using multi device iterator. Non-None iff datasets
        # prefetching is used
        'multi_device_iterator_input'
    ])
# TODO(reedwm): add upper_bound and lower_bound to appropriate integer and
# float flags, and change certain string flags to enum flags.

flags.DEFINE_string('model', 'trivial',
                    'Name of the model to run, the list of supported models '
                    'are defined in models/model.py')
# The code will first check if it's running under benchmarking mode
# or evaluation mode, depending on 'eval':
# Under the evaluation mode, this script will read a saved model,
# and compute the accuracy of the model against a validation dataset.
# Additional ops for accuracy and top_k predictors are only used under
# this mode.
# Under the benchmarking mode, user can specify whether or not to use
# the forward-only option, which will only compute the loss function.
# forward-only cannot be enabled with eval at the same time.
flags.DEFINE_boolean('eval', False, 'whether use eval or benchmarking')
flags.DEFINE_integer('eval_interval_secs', 0,
                     'How often to run eval on saved checkpoints. Usually the '
                     'same as save_model_secs from the corresponding training '
                     'run. Pass 0 to eval only once.')
flags.DEFINE_integer('eval_during_training_every_n_steps', None,
                     'Every n steps during training, pause training, run '
                     'evaluation, then resume training. Must not be used with '
                     '--eval, as unlike --eval, this option causes both '
                     'training and eval to be done. This may take slightly '
                     'more GPU memory than running just training or '
                     'evaluation alone. It also may slightly slow down '
                     'training, even when not taking into account the '
                     'additional time to evaluate.',
                     lower_bound=1)
flags.DEFINE_float('eval_during_training_every_n_epochs', None,
                   'After every n training epochs, pause training, run '
                   'evaluation, then resume training. See '
                   '--eval_during_training_every_n_steps for more '
                   'information.')
flags.DEFINE_list('eval_during_training_at_specified_steps', [],
                  'Specify a list of training steps, pause training at each '
                  'of these steps, run evaluation, then resume training. See '
                  '--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_epochs', [],
                  'Specify a list of training epochs, pause training after '
                  'each of these epochs, run evaluation, then resume '
                  'training. See --eval_during_training_every_n_steps for '
                  'more information.')
flags.DEFINE_boolean('forward_only', False,
                     'whether use forward-only or training for benchmarking')
flags.DEFINE_boolean('freeze_when_forward_only', False,
                     'whether to freeze the graph when in forward-only mode.')
flags.DEFINE_boolean('print_training_accuracy', False,
                     'whether to calculate and print training accuracy during '
                     'training')
flags.DEFINE_integer('batch_size', 0, 'batch size per compute device')
flags.DEFINE_integer('eval_batch_size', 0,
                     'eval batch size per compute device')
flags.DEFINE_integer('batch_group_size', 1,
                     'number of groups of batches processed in the image '
                     'producer.')
flags.DEFINE_integer('num_batches', None,
                     'number of batches to run, excluding '
                     'warmup. Defaults to %d' % _DEFAULT_NUM_BATCHES)
flags.DEFINE_integer('num_eval_batches', None,
                     'number of eval batches to run, excluding warmup. '
                     'Defaults to --num_batches')
flags.DEFINE_float('num_epochs', None,
                   'number of epochs to run, excluding warmup. '
                   'This and --num_batches cannot both be specified.')
flags.DEFINE_float('num_eval_epochs', None,
                   'number of eval epochs to run, excluding warmup. '
                   'Defaults to --num_epochs')
flags.DEFINE_float('stop_at_top_1_accuracy', None,
                   'If set, stops training after the evaluation accuracy hits '
                   'this number. Can only be used with one of the '
                   '--eval_during_training_* flags.')
flags.DEFINE_boolean('collect_eval_results_async', False,
                     'If True, start a separate process to postprocess eval '
                     'results asynchronously. This currently only works with '
                     'the SSD model.')
flags.DEFINE_integer('num_warmup_batches', None,
                     'number of batches to run before timing')
flags.DEFINE_integer('autotune_threshold', None,
                     'The autotune threshold for the models')
# TODO(tucker): change num_gpus to num_devices
flags.DEFINE_integer('num_gpus', 1, 'the number of GPUs to run on')
flags.DEFINE_string('gpu_indices', '', 'indices of worker GPUs in ring order')
flags.DEFINE_integer('display_every', 10,
                     'Number of local steps after which progress is printed '
                     'out')
flags.DEFINE_float('display_perf_ewma', None,
                   'If set, display numbers of images/sec using exponentially '
                   'weighted moving average with the specified weight, which '
                   'defines how much current value contributes to the '
                   'reported average. Increasing weight makes the reported '
                   'performance number reflect more about the real-time speed '
                   'instead of the entire history',
                   lower_bound=0, upper_bound=1)
flags.DEFINE_string('data_dir', None,
                    'Path to dataset in TFRecord format (aka Example '
                    'protobufs). If not specified, synthetic data will be '
                    'used.')
flags.DEFINE_string('data_name', None,
                    'Name of dataset: imagenet or cifar10. If not specified, '
                    'it is automatically guessed based on data_dir.')
flags.DEFINE_string('resize_method', 'bilinear',
                    'Method for resizing input images: crop, nearest, '
                    'bilinear, bicubic, area, or round_robin. The `crop` mode '
                    'requires source images to be at least as large as the '
                    'network input size. The `round_robin` mode applies '
                    'different resize methods based on position in a batch in '
                    'a round-robin fashion. Other modes support any sizes and '
                    'apply random bbox distortions before resizing (even with '
                    'distortions=False).')
flags.DEFINE_boolean('distortions', False,
                     'Enable/disable distortions during image preprocessing. '
                     'These include bbox and color distortions.')
flags.DEFINE_boolean('use_datasets', True,
                     'Enable use of datasets for input pipeline')
flags.DEFINE_string('input_preprocessor', 'default',
                    'Name of input preprocessor. The list of supported input '
                    'preprocessors are defined in preprocessing.py.')
flags.DEFINE_string('gpu_thread_mode', 'gpu_private',
                    'Methods to assign GPU host work to threads. '
                    'global: all GPUs and CPUs share the same global threads; '
                    'gpu_private: a private threadpool for each GPU; '
                    'gpu_shared: all GPUs share the same threadpool.')
flags.DEFINE_integer('per_gpu_thread_count', 0,
                     'The number of threads to use for GPU. Only valid when '
                     'gpu_thread_mode is not global.')
flags.DEFINE_boolean('hierarchical_copy', False,
                     'Use hierarchical copies. Currently only optimized for '
                     'use on a DGX-1 with 8 GPUs and may perform poorly on '
                     'other hardware. Requires --num_gpus > 1, and only '
                     'recommended when --num_gpus=8')
# TODO(hinsu): Support auto-detection of the network topology while still
# retaining the ability to specify a particular topology for debugging.
flags.DEFINE_enum('network_topology', constants.NetworkTopology.DGX1,
                  (constants.NetworkTopology.DGX1,
                   constants.NetworkTopology.GCP_V100),
                  'Network topology specifies the topology used to connect '
                  'multiple devices. Network topology is used to decide the '
                  'hierarchy to use for the hierarchical_copy.')
flags.DEFINE_integer('gradient_repacking', 0,
                     'Use gradient repacking. It currently only works with '
                     'replicated mode. At the end of each step, it repacks '
                     'the gradients for more efficient cross-device '
                     'transportation. A non-zero value specifies the number '
                     'of split packs that will be formed.',
                     lower_bound=0)
flags.DEFINE_boolean('compact_gradient_transfer', True,
                     'Compact gradient as much as possible for cross-device '
                     'transfer and aggregation.')
flags.DEFINE_enum('variable_consistency', 'strong', ('strong', 'relaxed'),
                  'The data consistency for trainable variables. With strong '
                  'consistency, the variable always has the updates from the '
                  'previous step. With relaxed consistency, all the updates '
                  'will eventually show up in the variables. Likely one step '
                  'behind.')
flags.DEFINE_boolean('datasets_repeat_cached_sample', False,
                     'Enable use of a special datasets pipeline that reads a '
                     'single TFRecord into memory and repeats it infinitely '
                     'many times. The purpose of this flag is to make it '
                     'possible to write regression tests that are not '
                     'bottlenecked by CNS throughput. '
                     'Use datasets_use_caching to cache input data.')
flags.DEFINE_enum('local_parameter_device', 'gpu',
                  ('cpu', 'gpu', 'CPU', 'GPU'),
                  'Device to use as parameter server: cpu or gpu. For '
                  'distributed training, it can affect where caching of '
                  'variables happens.')
flags.DEFINE_enum('device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
                  'Device to use for computation: cpu or gpu')
flags.DEFINE_enum('data_format', 'NCHW', ('NHWC', 'NCHW'),
                  'Data layout to use: NHWC (TF native) or NCHW (cuDNN '
                  'native, requires GPU).')
flags.DEFINE_integer('num_intra_threads', None,
                     'Number of threads to use for intra-op parallelism. If '
                     'set to 0, the system will pick an appropriate number. '
                     'None is the same as 0 except that it disables intra-op '
                     'parallelism on a GPU.')
flags.DEFINE_integer('num_inter_threads', 0,
                     'Number of threads to use for inter-op parallelism. If '
                     'set to 0, the system will pick an appropriate number.')
flags.DEFINE_boolean('use_numa_affinity', False,
                     'Whether to turn on NUMA affinity for CPU devices. '
                     'This is probably only useful when --device=cpu.')
flags.DEFINE_string('trace_file', '',
                    'Enable TensorFlow tracing and write trace to this file.')
flags.DEFINE_boolean('use_chrome_trace_format', True,
                     'If True, the trace_file, if specified, will be in a '
                     'Chrome trace format. If False, then it will be a '
                     'StepStats raw proto.')
_NUM_STEPS_TO_PROFILE = 10
_NUM_OPS_TO_PRINT = 20
flags.DEFINE_string('tfprof_file', None,
                    'If specified, write a tfprof ProfileProto to this file. '
                    'The performance and other aspects of the model can then '
                    'be analyzed with tfprof. See '
                    'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/g3doc/command_line.md '  # pylint: disable=line-too-long
                    'for more info on how to do this. The first %d steps '
                    'are profiled. Additionally, the top %d most time '
                    'consuming ops will be printed.\n'
                    'Note: profiling with tfprof is very slow, but most of '
                    'the overhead is spent between steps. So, profiling '
                    'results are more accurate than the slowdown would '
                    'suggest.' % (_NUM_STEPS_TO_PROFILE, _NUM_OPS_TO_PRINT))
flags.DEFINE_string('graph_file', None,
                    'Write the model\'s graph definition to this file. '
                    'Defaults to binary format unless filename ends in "txt".')
flags.DEFINE_string('partitioned_graph_file_prefix', None,
                    'If specified, after the graph has been partitioned and '
                    'optimized, write out each partitioned graph to a file '
                    'with the given prefix.')
flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'),
                  'Optimizer to use')
flags.DEFINE_float('init_learning_rate', None,
                   'Initial learning rate for training.')
flags.DEFINE_string('piecewise_learning_rate_schedule', None,
                    'Specifies a piecewise learning rate schedule based on '
                    'the number of epochs. This is the form '
                    'LR0;E1;LR1;...;En;LRn, where each LRi is a learning rate '
                    'and each Ei is an epoch indexed from 0. The learning '
                    'rate is LRi if E(i-1) <= current_epoch < Ei. For '
                    'example, if this parameter is 0.3;10;0.2;25;0.1, the '
                    'learning rate is 0.3 for the first 10 epochs, then is '
                    '0.2 for the next 15 epochs, then is 0.1 until training '
                    'ends.')
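# --- Illustration (not part of the original file) ---
# A minimal sketch of how a schedule string like '0.3;10;0.2;25;0.1' can be
# interpreted (hypothetical helper, not the parser this benchmark uses):
def _sketch_piecewise_lr(schedule, epoch):
  parts = [float(x) for x in schedule.split(';')]
  rates, boundaries = parts[0::2], parts[1::2]
  for boundary, rate in zip(boundaries, rates):
    if epoch < boundary:
      return rate
  return rates[-1]
# _sketch_piecewise_lr('0.3;10;0.2;25;0.1', 5)  -> 0.3
# _sketch_piecewise_lr('0.3;10;0.2;25;0.1', 12) -> 0.2
# _sketch_piecewise_lr('0.3;10;0.2;25;0.1', 30) -> 0.1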
flags.DEFINE_float('num_epochs_per_decay', 0,
                   'Steps after which learning rate decays. If 0, the '
                   'learning rate does not decay.')
flags.DEFINE_float('learning_rate_decay_factor', 0,
                   'Learning rate decay factor. Decay by this factor every '
                   '`num_epochs_per_decay` epochs. If 0, learning rate does '
                   'not decay.')
flags.DEFINE_float('num_learning_rate_warmup_epochs', 0,
                   'Slowly increase to the initial learning rate in the first '
                   'num_learning_rate_warmup_epochs linearly.')
flags.DEFINE_float('minimum_learning_rate', 0,
                   'The minimum learning rate. The learning rate will '
                   'never decay past this value. Requires `learning_rate`, '
                   '`num_epochs_per_decay` and `learning_rate_decay_factor` '
                   'to be set.')
flags.DEFINE_float('resnet_base_lr', None,
                   "Base learning rate at bs=256. Only "
                   "relevant when training ResNet and utilizing the model's "
                   "learning rate heuristic (get_learning_rate).")
flags.DEFINE_float('momentum', 0.9, 'Momentum for training.')
flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')
flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum in RMSProp.')
flags.DEFINE_float('rmsprop_epsilon', 1.0, 'Epsilon term for RMSProp.')
flags.DEFINE_float('adam_beta1', 0.9, 'Beta1 term for the Adam optimizer')
flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer')
flags.DEFINE_float('adam_epsilon', 1e-8,
                   'Epsilon term for the Adam optimizer')
flags.DEFINE_float('gradient_clip', None,
                   'Gradient clipping magnitude. Disabled by default.')
flags.DEFINE_float('weight_decay', 0.00004,
                   'Weight decay factor for training.')
flags.DEFINE_float('gpu_memory_frac_for_testing', 0,
                   'If non-zero, the fraction of GPU memory that will be '
                   'used. Useful for testing the benchmark script, as this '
                   'allows distributed mode to be run on a single machine. '
                   'For example, if there are two tasks, each can be '
                   'allocated ~40 percent of the memory on a single machine. '
                   'This is also useful for using unified memory, as this can '
                   'be set above 1 to oversubscribe the GPU using unified '
                   'memory.',
                   lower_bound=0.)
flags.DEFINE_boolean('use_unified_memory', None,
                     'If True, allocate unified memory enabling larger models '
                     'to fit in available device RAM.')
flags.DEFINE_boolean('timestamped_allocator', False,
                     'If True marks free BFCAllocator::Chunks with time '
                     'at which they are freed which can allow more efficient '
                     'memory allocation in cases like RDMA networking.')
flags.DEFINE_integer('gpu_kt_max_interval', 0,
                     'If > 0, the maximum number of GPU Ops that may be '
                     'queued in a row without also queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_bytes', 0,
                     'If > 0, the maximum number of bytes '
                     'of GPU memory that may be allocated by sequential '
                     'GPU Ops without queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_pending', 0,
                     'If > 0 no more than this many GPU tracking events may '
                     'be outstanding at any time. When this limit is reached '
                     'launch of additional kernels will stall until an '
                     'outstanding event completes.')
flags.DEFINE_boolean('use_tf_layers', True,
                     'If True, use tf.layers for neural network layers. This '
                     'should not affect performance or accuracy in any way.')
flags.DEFINE_integer('tf_random_seed', 1234,
                     'The TensorFlow random seed. Useful for debugging NaNs, '
                     'as this can be set to various values to see if the NaNs '
                     'depend on the seed.')
flags.DEFINE_string('debugger', None,
                    'If set, use the TensorFlow debugger. If set to "cli", '
                    'use the local CLI debugger. Otherwise, this must be in '
                    'the form hostname:port (e.g., localhost:7007) in which '
                    'case the experimental TensorBoard debugger will be used')
flags.DEFINE_boolean('use_python32_barrier', False,
                     'When on, use threading.Barrier at Python 3.2.')
flags.DEFINE_boolean('ml_perf', False,
                     'When True, change how the Imagenet input pipeline works '
                     'slightly to meet the MLPerf compliance rules. This '
                     'slows down the input pipeline. Without this option, at '
                     'the end of the input pipeline, the image is divided by '
                     '127.5, then 1.0 is subtracted from it, bringing the '
                     'image values from [0, 255] to [-1.0, 1.0]. With this '
                     'option, each of the three channels (red, green, blue) '
                     'have the average channel value among all images '
                     'subtracted from it, and no division is done.')
flags.DEFINE_boolean('datasets_use_prefetch', True,
                     'Enable use of prefetched datasets for input pipeline. '
                     'This option is meaningless if use_datasets=False.')
flags.DEFINE_integer('datasets_prefetch_buffer_size', 1,
                     'Prefetching op buffer size per compute device.')
flags.DEFINE_integer('datasets_num_private_threads', None,
                     'Number of threads for a private threadpool created for '
                     'all datasets computation. By default, we pick an '
                     'appropriate number. If set to 0, we use the default '
                     'tf-Compute threads for dataset operations.')
flags.DEFINE_boolean('datasets_use_caching', False,
                     'Cache the compressed input data in memory. This '
                     'improves the data input performance, at the cost of '
                     'additional memory.')
flags.DEFINE_integer('datasets_parallel_interleave_cycle_length', None,
                     'Number of parallel file readers interleaving input '
                     'data.')
flags.DEFINE_boolean('datasets_sloppy_parallel_interleave', False,
                     'Allow parallel interleave to depart from deterministic '
                     'ordering, by temporarily skipping over files whose '
                     'elements are not readily available. This can increase '
                     'throughput in particular in the presence of stragglers.')
flags.DEFINE_integer('datasets_parallel_interleave_prefetch', None,
                     'The number of input elements to fetch before they are '
                     'needed for interleaving.')
flags.DEFINE_integer('multi_device_iterator_max_buffer_size', 1,
                     'Configuration parameter for the MultiDeviceIterator '
                     'that specifies the host side buffer size for each '
                     'device.')

# Performance tuning parameters.
flags.DEFINE_boolean('winograd_nonfused', True,
                     'Enable/disable using the Winograd non-fused algorithms.')
flags.DEFINE_boolean('batchnorm_persistent', True,
                     'Enable/disable using the '
                     'CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for batchnorm.')
flags.DEFINE_boolean('sync_on_finish', False,
                     'Enable/disable whether the devices are synced after '
                     'each step.')
flags.DEFINE_boolean('staged_vars', False,
                     'whether the variables are staged from the main '
                     'computation')
flags.DEFINE_boolean('force_gpu_compatible', False,
                     'whether to enable force_gpu_compatible in GPU_Options')
flags.DEFINE_boolean('allow_growth', None,
                     'whether to enable allow_growth in GPU_Options')
flags.DEFINE_boolean('xla', False,
                     'whether to enable XLA auto-jit compilation')
flags.DEFINE_boolean('xla_compile', False,
                     'Enable xla to compile the graph. Uncompilable ops will '
                     'result in fatal errors.')
flags.DEFINE_boolean('fuse_decode_and_crop', True,
                     'Fuse decode_and_crop for image preprocessing.')
flags.DEFINE_boolean('distort_color_in_yiq', True,
                     'Distort color of input images in YIQ space.')
flags.DEFINE_boolean('enable_optimizations', True,
                     'Whether to enable grappler and other optimizations.')
flags.DEFINE_string('rewriter_config', None,
                    'Config for graph optimizers, described as a '
                    'RewriterConfig proto buffer.')
flags.DEFINE_enum('loss_type_to_report', 'total_loss',
                  ('base_loss', 'total_loss'),
                  'Which type of loss to output and to write summaries for. '
                  'The total loss includes L2 loss while the base loss does '
                  'not. Note that the total loss is always used while '
                  'computing gradients during training if weight_decay > 0, '
                  'but explicitly computing the total loss, instead of just '
                  'computing its gradients, can have a performance impact.')
flags.DEFINE_boolean('single_l2_loss_op', False,
                     'If True, instead of using an L2 loss op per variable, '
                     'concatenate the variables into a single tensor and do a '
                     'single L2 loss on the concatenated tensor.')
flags.DEFINE_boolean('use_resource_vars', False,
                     'Use resource variables instead of normal variables. '
                     'Resource variables are slower, but this option is '
                     'useful for debugging their performance.')
flags.DEFINE_boolean('compute_lr_on_cpu', False,
                     'If True, do computations related to learning rate on '
                     'the CPU instead of the GPU. This will significantly '
                     'improve XLA performance in some cases.')
flags.DEFINE_boolean('sparse_to_dense_grads', False,
                     'If True, convert all sparse gradients to dense '
                     'gradients before passing them to the optimizer to '
                     'update variables. Only affects models with sparse '
                     'gradients, which currently is only the NCF model.')
# Performance tuning specific to MKL.
flags.DEFINE_boolean('mkl', False, 'If true, set MKL environment variables.')
flags.DEFINE_integer('kmp_blocktime', 0,
                     'The time, in milliseconds, that a thread should wait, '
                     'after completing the execution of a parallel region, '
                     'before sleeping')
flags.DEFINE_string('kmp_affinity', 'granularity=fine,verbose,compact,1,0',
                    'Restricts execution of certain threads (virtual '
                    'execution units) to a subset of the physical processing '
                    'units in a multiprocessor computer.')
flags.DEFINE_integer('kmp_settings', 1,
                     'If set to 1, MKL settings will be printed.')

# fp16 parameters. If use_fp16=False, no other fp16 parameters apply.
flags.DEFINE_boolean('use_fp16', False,
                     'Use 16-bit floats for certain tensors instead of 32-bit '
                     'floats. This is currently experimental.')
# TODO(reedwm): The default loss scale of 128 causes most models to diverge
# on the second step with synthetic data. Changing the tf.set_random_seed
# call to tf.set_random_seed(1235) or most other seed values causes the
# issue not to occur.
flags.DEFINE_float('fp16_loss_scale', None,
                   'If fp16 is enabled, the loss is multiplied by this amount '
                   'right before gradients are computed, then each gradient '
                   'is divided by this amount. Mathematically, this has no '
                   'effect, but it helps avoid fp16 underflow. Set to 1 to '
                   'effectively disable. Ignored during eval.')
flags.DEFINE_boolean('fp16_vars', False,
                     'If fp16 is enabled, also use fp16 for variables. If '
                     'False, the variables are stored in fp32 and casted to '
                     'fp16 when retrieved. Recommended to leave as False.')
flags.DEFINE_boolean('fp16_enable_auto_loss_scale', False,
                     'If True and use_fp16 is True, automatically adjust the '
                     'loss scale during training.')
flags.DEFINE_integer('fp16_inc_loss_scale_every_n', 1000,
                     'If fp16 is enabled and fp16_enable_auto_loss_scale is '
                     'True, increase the loss scale every n steps.')

# The method for managing variables:
#   parameter_server: variables are stored on a parameter server that holds
#       the master copy of the variable. In local execution, a local device
#       acts as the parameter server for each variable; in distributed
#       execution, the parameter servers are separate processes in the
#       cluster.
#       For each step, each tower gets a copy of the variables from the
#       parameter server, and sends its gradients to the param server.
#   replicated: each GPU has its own copy of the variables. To apply
#       gradients, an all_reduce algorithm or regular cross-device
#       aggregation is used to replicate the combined gradients to all
#       towers (depending on all_reduce_spec parameter setting).
#   independent: each GPU has its own copy of the variables, and gradients
#       are not shared between towers. This can be used to check performance
#       when no data is moved between GPUs.
#   distributed_replicated: Distributed training only. Each GPU has a copy
#       of the variables, and updates its copy after the parameter servers
#       are all updated with the gradients from all servers. Only works with
#       cross_replica_sync=true. Unlike 'replicated', currently never uses
#       nccl all-reduce for replicating within a server.
#   distributed_all_reduce: Distributed training where all replicas run
#       in a single session, using all-reduce to mutually reduce the
#       gradients. Uses no parameter servers. When there is only one
#       worker, this is the same as replicated.
#   collective_all_reduce: Distributed training where all replicas run
#       independently except for variable initialization and for
#       gradient reduction which is done via collective all-reduce.
#       NOTE: collective_all_reduce in conjunction with use_fp16 can
#       lead to NaNs in some models (resnet50). TODO(tucker): fix it.
#   horovod: Distributed training using Horovod library. Runs workers using
#       an MPI framework (e.g. Open MPI). Each worker runs training on
#       single GPU, and averages gradients using NCCL or MPI all-reduce.
#       See https://github.com/uber/horovod for more details.
flags.DEFINE_enum('variable_update', 'parameter_server',
                  ('parameter_server', 'replicated', 'distributed_replicated',
                   'independent', 'distributed_all_reduce',
                   'collective_all_reduce', 'horovod'),
                  'The method for managing variables: parameter_server, '
                  'replicated, distributed_replicated, independent, '
                  'distributed_all_reduce, collective_all_reduce, horovod')
flags.DEFINE_string('all_reduce_spec', None,
                    'A specification of the all_reduce algorithm to be used '
                    'for reducing gradients. For more details, see '
                    'parse_all_reduce_spec in variable_mgr.py. An '
                    'all_reduce_spec has BNF form:\n'
                    'int ::= positive whole number\n'
                    'g_int ::= int[KkMGT]?\n'
                    'alg_spec ::= alg | alg#int\n'
                    'range_spec ::= alg_spec | alg_spec/alg_spec\n'
                    'spec ::= range_spec | range_spec:g_int:range_spec\n'
                    'NOTE: not all syntactically correct constructs are '
                    'supported.\n\n'
                    'Examples:\n'
                    '"xring" == use one global ring reduction for all '
                    'tensors\n'
                    '"pscpu" == use CPU at worker 0 to reduce all tensors\n'
                    '"nccl" == use NCCL to locally reduce all tensors. '
                    'Limited to 1 worker.\n'
                    '"nccl/xring" == locally (to one worker) reduce values '
                    'using NCCL then ring reduce across workers.\n'
                    '"pscpu:32k:xring" == use pscpu algorithm for tensors of '
                    'size up to 32kB, then xring for larger tensors.')
# If variable_update==distributed_all_reduce then it may be advantageous
# to aggregate small tensors into one prior to reduction. These parameters
# control that aggregation.
flags.DEFINE_integer('agg_small_grads_max_bytes', 0,
                     'If > 0, try to aggregate tensors of less than this '
                     'number of bytes prior to all-reduce.')
flags.DEFINE_integer('agg_small_grads_max_group', 10,
                     'When aggregating small tensors for all-reduce do not '
                     'aggregate more than this many into one new tensor.')
flags.DEFINE_integer('allreduce_merge_scope', 1,
                     'Establish a name scope around this many '
                     'gradients prior to creating the all-reduce operations. '
                     'It may affect the ability of the backend to merge '
                     'parallel ops.')

# Distributed training parameters.
flags.DEFINE_enum('job_name', '', ('ps', 'worker', 'controller', ''),
                  'One of "ps", "worker", "controller", "". Empty for local '
                  'training')
flags.DEFINE_string('ps_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('worker_hosts', '', 'Comma-separated list of target hosts')
flags.DEFINE_string('controller_host', None, 'optional controller host')
flags.DEFINE_integer('task_index', 0, 'Index of task within the job')
flags.DEFINE_string('server_protocol', 'grpc', 'protocol for servers')
flags.DEFINE_boolean('cross_replica_sync', True, '')
flags.DEFINE_string('horovod_device', '',
                    'Device to do Horovod all-reduce on: '
                    'empty (default), cpu or gpu. Default will utilize GPU if '
                    'Horovod was compiled with the HOROVOD_GPU_ALLREDUCE '
                    'option, and CPU otherwise.')

# Summary and Save & load checkpoints.
flags.DEFINE_integer('summary_verbosity', 0,
                     'Verbosity level for summary ops. '
                     'level 0: disable any summary.\n'
                     'level 1: small and fast ops, e.g.: learning_rate, '
                     'total_loss.\n'
                     'level 2: medium-cost ops, e.g. histogram of all '
                     'gradients.\n'
                     'level 3: expensive ops: images and histogram of each '
                     'gradient.\n')
flags.DEFINE_integer('save_summaries_steps', 0,
                     'How often to save summaries for trained models. Pass 0 '
                     'to disable summaries.')
flags.DEFINE_integer('save_model_secs', 0,
                     'How often to save trained models. Pass 0 to disable '
                     'saving checkpoints every N seconds. A checkpoint is '
                     'saved after training completes regardless of this '
                     'option.')
flags.DEFINE_integer('save_model_steps', None,
                     'How often to save trained models. If specified, '
                     'save_model_secs must not be specified.')
flags.DEFINE_integer('max_ckpts_to_keep', 5,
                     'Max number of checkpoints to keep.')
flags.DEFINE_string('train_dir', None,
                    'Path to session checkpoints. Pass None to disable saving '
                    'checkpoint at the end.')
flags.DEFINE_string('eval_dir', '/tmp/tf_cnn_benchmarks/eval',
                    'Directory where to write eval event logs.')
flags.DEFINE_string('backbone_model_path', None,
                    'Path to pretrained backbone model checkpoint. Pass None '
                    'if not using a backbone model.')
flags.DEFINE_enum('trt_mode', '', ['', 'FP32', 'FP16', 'INT8'],
                  'If this is specified in forward_only mode and '
                  'freeze_when_forward_only is set to True, use TensorRT to '
                  'optimize the graph before execution.')
flags.DEFINE_integer('trt_max_workspace_size_bytes', 4 << 30,
                     'Max workspace size bytes used by the TensorRT '
                     'optimizer.')

# Benchmark logging for model garden metric
flags.DEFINE_string('benchmark_log_dir', None,
                    'The directory to place the log files containing the '
                    'results of benchmark. The logs are created by '
                    'BenchmarkFileLogger. Requires the root of the Tensorflow '
                    'models repository to be in $PYTHONPATH.')
flags.DEFINE_string('benchmark_test_id', None,
                    'The unique test ID of the benchmark run. It could be the '
                    'combination of key parameters. It is hardware '
                    'independent and could be used to compare the performance '
                    'between different test runs. This flag is designed for '
                    'human consumption, and does not have any impact within '
                    'the system.')

platforms_util.define_platform_params()
class GlobalStepWatcher(threading.Thread):
  """A helper class for global_step.

  Polls for changes in the global_step of the model, and finishes when the
  number of steps for the global run are done.
  """

  def __init__(self, sess, global_step_op, start_at_global_step,
               end_at_global_step):
    threading.Thread.__init__(self)
    self.sess = sess
    self.global_step_op = global_step_op
    self.start_at_global_step = start_at_global_step
    self.end_at_global_step = end_at_global_step

    self.start_time = 0
    self.start_step = 0
    self.finish_time = 0
    self.finish_step = 0

  def run(self):
    while self.finish_time == 0:
      time.sleep(.25)
      global_step_val, = self.sess.run([self.global_step_op])
      if self.start_time == 0 and global_step_val >= self.start_at_global_step:
        # Use tf.logging.info instead of log_fn, since print (which is log_fn)
        # is not thread safe and may interleave the outputs from two parallel
        # calls to print, which can break tests.
        tf.logging.info('Starting real work at step %s at time %s' %
                        (global_step_val, time.ctime()))
        self.start_time = time.time()
        self.start_step = global_step_val
      if self.finish_time == 0 and global_step_val >= self.end_at_global_step:
        tf.logging.info('Finishing real work at step %s at time %s' %
                        (global_step_val, time.ctime()))
        self.finish_time = time.time()
        self.finish_step = global_step_val

  def done(self):
    return self.finish_time > 0

  def num_steps(self):
    return self.finish_step - self.start_step

  def elapsed_time(self):
    return self.finish_time - self.start_time


class CheckpointNotFoundException(Exception):
  pass


def create_config_proto(params):
  """Returns session config proto.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
  """
  config = tf.ConfigProto()
  config.allow_soft_placement = True
  if params.num_intra_threads is None:
    if params.device == 'gpu':
      config.intra_op_parallelism_threads = 1
  else:
    config.intra_op_parallelism_threads = params.num_intra_threads
  config.inter_op_parallelism_threads = params.num_inter_threads
  config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
  config.gpu_options.experimental.collective_ring_order = params.gpu_indices
  config.gpu_options.force_gpu_compatible = params.force_gpu_compatible
  config.experimental.use_numa_affinity = params.use_numa_affinity
  if params.device == 'cpu':
    # TODO(tucker): change num_gpus to num_devices
    config.device_count['CPU'] = params.num_gpus
  if params.allow_growth is not None:
    config.gpu_options.allow_growth = params.allow_growth
  if params.gpu_memory_frac_for_testing > 0:
    config.gpu_options.per_process_gpu_memory_fraction = (
        params.gpu_memory_frac_for_testing)
  if params.use_unified_memory:
    config.gpu_options.experimental.use_unified_memory = (
        params.use_unified_memory)
  if params.timestamped_allocator:
    config.gpu_options.experimental.timestamped_allocator = (
        params.timestamped_allocator)
  if params.gpu_kt_max_interval > 0:
    config.gpu_options.experimental.kernel_tracker_max_interval = (
        params.gpu_kt_max_interval)
  if params.gpu_kt_max_bytes > 0:
    config.gpu_options.experimental.kernel_tracker_max_bytes = (
        params.gpu_kt_max_bytes)
  if params.gpu_kt_max_pending > 0:
    config.gpu_options.experimental.kernel_tracker_max_pending = (
        params.gpu_kt_max_pending)
  if params.xla:
    config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  if params.rewriter_config:
    rewriter_config = rewriter_config_pb2.RewriterConfig()
    text_format.Merge(params.rewriter_config, rewriter_config)
    config.graph_options.rewrite_options.CopyFrom(rewriter_config)
  elif not params.enable_optimizations:
    config.graph_options.optimizer_options.opt_level = tf.OptimizerOptions.L0
    config.graph_options.rewrite_options.disable_meta_optimizer = True
  elif params.variable_update == 'collective_all_reduce':
    rewrite_options = config.graph_options.rewrite_options
    rewrite_options.scoped_allocator_optimization = (
        rewriter_config_pb2.RewriterConfig.ON)
    rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
  if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    config.gpu_options.visible_device_list = str(hvd.local_rank())
  # For collective_all_reduce, ignore all devices except current worker.
  if params.variable_update == 'collective_all_reduce':
    del config.device_filters[:]
    config.device_filters.append(
        '/job:%s/replica:0/task:%d' % (params.job_name, params.task_index))

  # TODO(b/117324590): Re-enable PinToHostOptimizer when b/117324590 is fixed.
  # Currently we have to disable PinToHostOptimizer w/ XLA since it causes
  # OOM/perf cliffs.
  config.graph_options.rewrite_options.pin_to_host_optimization = (
      rewriter_config_pb2.RewriterConfig.OFF)
  return config
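
# Illustrative usage sketch (not part of the original file): assuming the
# flags above have been registered, a session config for a local run could be
# obtained roughly as follows. The specific parameter values are hypothetical.
#
#   params = make_params(num_gpus=1)
#   config = create_config_proto(params)
#   sess = tf.Session(config=config)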


def get_mode_from_params(params):
  """Returns the mode in which this script is running.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.

  Raises:
    ValueError: Unsupported params settings.
  """
  if params.forward_only and params.eval:
    raise ValueError('Only one of forward_only and eval parameters is true')

  if params.eval:
    return constants.BenchmarkMode.EVAL
  elif params.forward_only:
    return constants.BenchmarkMode.FORWARD_ONLY
  elif (params.eval_during_training_every_n_steps or
        params.eval_during_training_every_n_epochs or
        params.eval_during_training_at_specified_steps or
        params.eval_during_training_at_specified_epochs):
    return constants.BenchmarkMode.TRAIN_AND_EVAL
  else:
    return constants.BenchmarkMode.TRAIN


# How many digits to show for the loss and accuracies during training.
LOSS_AND_ACCURACY_DIGITS_TO_SHOW = 3


def benchmark_one_step(sess,
                       fetches,
                       step,
                       batch_size,
                       step_train_times,
                       trace_filename,
                       partitioned_graph_file_prefix,
                       profiler,
                       image_producer,
                       params,
                       summary_op=None,
                       show_images_per_sec=True,
                       benchmark_logger=None,
                       collective_graph_key=0):
  """Advance one step of benchmarking."""
  should_profile = profiler and 0 <= step < _NUM_STEPS_TO_PROFILE
  need_options_and_metadata = (
      should_profile or collective_graph_key > 0 or
      ((trace_filename or partitioned_graph_file_prefix) and step == -2)
  )
  if need_options_and_metadata:
    run_options = tf.RunOptions()
    if (trace_filename and step == -2) or should_profile:
      run_options.trace_level = tf.RunOptions.FULL_TRACE
    if partitioned_graph_file_prefix and step == -2:
      run_options.output_partition_graphs = True
    if collective_graph_key > 0:
      run_options.experimental.collective_graph_key = collective_graph_key
    run_metadata = tf.RunMetadata()
  else:
    run_options = None
    run_metadata = None
  summary_str = None
  start_time = time.time()
  if summary_op is None:
    results = sess.run(fetches, options=run_options,
                       run_metadata=run_metadata)
  else:
    (results, summary_str) = sess.run(
        [fetches, summary_op], options=run_options,
        run_metadata=run_metadata)
  if not params.forward_only:
    lossval = results['average_loss']
  else:
    lossval = 0.
  if image_producer is not None:
    image_producer.notify_image_consumption()
  train_time = time.time() - start_time
  step_train_times.append(train_time)
  if (show_images_per_sec and step >= 0 and
      (step == 0 or (step + 1) % params.display_every == 0)):
    speed_mean, speed_uncertainty, speed_jitter = get_perf_timing(
        batch_size, step_train_times, params.display_perf_ewma)
    log_str = '%i\t%s\t%.*f' % (
        step + 1,
        get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter),
        LOSS_AND_ACCURACY_DIGITS_TO_SHOW, lossval)
    if 'top_1_accuracy' in results:
      log_str += '\t%.*f\t%.*f' % (
          LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_1_accuracy'],
          LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_5_accuracy'])
    log_fn(log_str)
    if benchmark_logger:
      benchmark_logger.log_metric(
          'current_examples_per_sec', speed_mean, global_step=step + 1)
      if 'top_1_accuracy' in results:
        benchmark_logger.log_metric(
            'top_1_accuracy', results['top_1_accuracy'],
            global_step=step + 1)
        benchmark_logger.log_metric(
            'top_5_accuracy', results['top_5_accuracy'],
            global_step=step + 1)
  if need_options_and_metadata:
    if should_profile:
      profiler.add_step(step, run_metadata)
    if trace_filename and step == -2:
      log_fn('Dumping trace to %s' % trace_filename)
      trace_dir = os.path.dirname(trace_filename)
      if not gfile.Exists(trace_dir):
        gfile.MakeDirs(trace_dir)
      with gfile.Open(trace_filename, 'w') as trace_file:
        if params.use_chrome_trace_format:
          trace = timeline.Timeline(step_stats=run_metadata.step_stats)
          trace_file.write(
              trace.generate_chrome_trace_format(show_memory=True))
        else:
          trace_file.write(str(run_metadata.step_stats))
    if partitioned_graph_file_prefix and step == -2:
      path, filename = os.path.split(partitioned_graph_file_prefix)
      if '.' in filename:
        base_filename, ext = filename.rsplit('.', 1)
        ext = '.' + ext
      else:
        base_filename, ext = filename, ''
      as_text = filename.endswith('txt')
      for graph_def in run_metadata.partition_graphs:
        device = graph_def.node[0].device.replace('/', '_').replace(':', '_')
        graph_filename = '%s%s%s' % (base_filename, device, ext)
        log_fn('Writing partitioned GraphDef as %s to %s' % (
            'text' if as_text else 'binary',
            os.path.join(path, graph_filename)))
        tf.train.write_graph(graph_def, path, graph_filename, as_text)
  return (summary_str, lossval)


def get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter, scale=1):
  if scale == 1:
    # TODO(laigd): rename 'images' to maybe 'inputs', same below.
    return ('images/sec: %.1f +/- %.1f (jitter = %.1f)' %
            (speed_mean, speed_uncertainty, speed_jitter))
  else:
    return 'images/sec: %.1f' % speed_mean


def get_perf_timing(batch_size, step_train_times, ewma_alpha=None, scale=1):
  """Calculate benchmark processing speed."""
  times = np.array(step_train_times)
  speeds = batch_size / times
  if ewma_alpha:
    weights = np.logspace(len(times) - 1, 0, len(times), base=1 - ewma_alpha)
    time_mean = np.average(times, weights=weights)
  else:
    time_mean = np.mean(times)
  speed_mean = scale * batch_size / time_mean
  speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds)))
  speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds)))
  return speed_mean, speed_uncertainty, speed_jitter
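
# Worked example of the statistics above (illustrative numbers): with
# batch_size=256 and step_train_times=[0.50, 0.52, 0.48, 0.51] seconds,
# speeds = 256 / times = [512.0, 492.3, 533.3, 502.0] images/sec, so
# speed_mean = 256 / mean(times) ~= 509.5, speed_uncertainty =
# std(speeds) / sqrt(4) ~= 7.6, and speed_jitter (1.4826 times the median
# absolute deviation of the speeds) ~= 14.6.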


def load_checkpoint(saver, sess, ckpt_dir):
  """Loads checkpoint from provided directory or full path.

  Args:
    saver: Saver used to restore the checkpoint.
    sess: TensorFlow session.
    ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.

  Returns:
    Global step.
  """
  model_checkpoint_path = _get_checkpoint_to_load(ckpt_dir)
  global_step = model_checkpoint_path.split('/')[-1].split('-')[-1]
  if not global_step.isdigit():
    global_step = 0
  else:
    global_step = int(global_step)
  saver.restore(sess, model_checkpoint_path)
  log_fn('Successfully loaded model from %s.' % model_checkpoint_path)
  return global_step


def _get_checkpoint_to_load(ckpt_dir):
  """Returns which checkpoint to load.

  Args:
    ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.

  Returns:
    Full path to checkpoint to load.

  Raises:
    CheckpointNotFoundException: If checkpoint is not found.
  """
  p = re.compile(r'ckpt-\d+$')
  if p.search(ckpt_dir):
    model_checkpoint_path = ckpt_dir
  else:
    # Finds latest checkpoint in directory provided
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
      model_checkpoint_path = ckpt.model_checkpoint_path
    else:
      raise CheckpointNotFoundException(
          'No checkpoint file found in dir:{}'.format(ckpt_dir))
  return model_checkpoint_path
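
# Illustrative behavior (not part of the original file): a path ending in
# something like 'model.ckpt-12345' matches the r'ckpt-\d+$' pattern above and
# is used directly, while a directory such as '/tmp/train_dir' (hypothetical)
# is resolved through tf.train.get_checkpoint_state() to its latest
# checkpoint.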


# Params are passed to BenchmarkCNN's constructor. Params is a map from name
# to value, with one field per key in flags.param_specs.
#
# Call make_params() or make_params_from_flags() below to construct a Params
# tuple with default values from flags.param_specs, rather than constructing
# Params directly.
Params = namedtuple('Params', flags.param_specs.keys())  # pylint: disable=invalid-name


def validate_params(params):
  """Validates that the Params tuple had valid values.

  When command-line flags are defined for each ParamSpec by calling
  flags.define_flags(), calling this function is unnecessary because absl
  already does flag validation. Otherwise, this function should be called.

  Args:
    params: A Params tuple.
  Raises:
    ValueError: An element of params had an invalid value.
  """
  for name, value in params._asdict().items():
    param_spec = flags.param_specs[name]
    if param_spec.flag_type in ('integer', 'float'):
      if (value is not None and
          param_spec.kwargs['lower_bound'] is not None and
          value < param_spec.kwargs['lower_bound']):
        raise ValueError('Param %s value of %s is lower than the lower bound '
                         'of %s' %
                         (name, value, param_spec.kwargs['lower_bound']))
      if (value is not None and
          param_spec.kwargs['upper_bound'] is not None and
          param_spec.kwargs['upper_bound'] < value):
        raise ValueError('Param %s value of %s is higher than the upper bound '
                         'of %s' %
                         (name, value, param_spec.kwargs['upper_bound']))
    elif (value is not None and param_spec.flag_type == 'enum' and
          value not in param_spec.kwargs['enum_values']):
      raise ValueError('Param %s of value %s is not in %s' %
                       (name, value, param_spec.kwargs['enum_values']))


def make_params(**kwargs):
  """Create a Params tuple for BenchmarkCNN from kwargs.

  Default values are filled in from flags.param_specs.

  Args:
    **kwargs: kwarg values will override the default values.
  Returns:
    Params namedtuple for constructing BenchmarkCNN.
  """
  # Create a (name: default_value) map from flags.param_specs.
  default_kwargs = {
      name: flags.param_specs[name].default_value
      for name in flags.param_specs
  }
  params = Params(**default_kwargs)._replace(**kwargs)
  validate_params(params)
  return params


def make_params_from_flags():
  """Create a Params tuple for BenchmarkCNN from absl_flags.FLAGS.

  Returns:
    Params namedtuple for constructing BenchmarkCNN.
  """
  # Collect (name: value) pairs for absl_flags.FLAGS with matching names in
  # flags.param_specs.
  flag_values = {name: getattr(absl_flags.FLAGS, name)
                 for name in flags.param_specs.keys()}
  return Params(**flag_values)


def remove_param_fields(params, fields_to_remove):
  """Remove fields from a Params namedtuple."""
  params_dict = params._asdict()
  for field in fields_to_remove:
    assert field in params_dict, 'Invalid Params field: ' + field
  params_dict = {k: v for k, v in params_dict.items()
                 if k not in fields_to_remove}
  new_params_type = namedtuple('Params', params_dict.keys())
  return new_params_type(**params_dict)
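
# Illustrative usage sketch (not part of the original file): overriding a few
# defaults when driving the benchmark from Python rather than from flags. The
# parameter values below are hypothetical.
#
#   params = make_params(model='resnet50', batch_size=64, num_gpus=2)
#   bench = BenchmarkCNN(params)
#   bench.print_info()
#   bench.run()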


def get_num_batches_and_epochs(params, batch_size, num_examples_per_epoch):
  """Returns the number of batches and epochs to run for.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
    batch_size: The number of images per step.
    num_examples_per_epoch: The number of images in a single epoch.

  Returns:
    num_batches: The number of batches to run for.
    num_epochs: The number of epochs to run for. This might be slightly
      different from params.num_epochs if specified, because the number of
      batches must be an integer.

  Raises:
    ValueError: Invalid or unsupported params.
  """
  if params.num_batches and params.num_epochs:
    raise ValueError('At most one of --num_batches and --num_epochs may be '
                     'specified.')
  if params.num_epochs:
    num_batches = int(params.num_epochs * num_examples_per_epoch +
                      batch_size - 1) // batch_size
  else:
    num_batches = params.num_batches or _DEFAULT_NUM_BATCHES
  num_epochs = num_batches * batch_size / num_examples_per_epoch
  return (num_batches, num_epochs)
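
# Worked example of the rounding above (illustrative numbers): with
# num_epochs=1, batch_size=256 and num_examples_per_epoch=50000,
#   num_batches = (1 * 50000 + 256 - 1) // 256 = 50255 // 256 = 196
#   num_epochs  = 196 * 256 / 50000 ~= 1.004
# i.e. the batch count is rounded so that at least the requested number of
# examples is consumed.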


def get_piecewise_learning_rate(piecewise_learning_rate_schedule,
                                global_step, num_batches_per_epoch):
  """Returns a piecewise learning rate tensor.

  Args:
    piecewise_learning_rate_schedule: The --piecewise_learning_rate_schedule
      parameter
    global_step: Scalar tensor representing the global step.
    num_batches_per_epoch: float indicating the number of batches per epoch.

  Returns:
    A scalar float tensor, representing the learning rate.

  Raises:
    ValueError: piecewise_learning_rate_schedule is not formatted correctly.
  """
  pieces = piecewise_learning_rate_schedule.split(';')
  if len(pieces) % 2 == 0:
    raise ValueError('--piecewise_learning_rate_schedule must have an odd '
                     'number of components')
  values = []
  boundaries = []
  for i, piece in enumerate(pieces):
    if i % 2 == 0:
      try:
        values.append(float(piece))
      except ValueError:
        raise ValueError('Invalid learning rate: ' + piece)
    else:
      try:
        boundaries.append(int(int(piece) * num_batches_per_epoch) - 1)
      except ValueError:
        raise ValueError('Invalid epoch: ' + piece)
  return tf.train.piecewise_constant(global_step, boundaries, values,
                                     name='piecewise_learning_rate')
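
# Illustrative example of the expected schedule format (not part of the
# original file): learning-rate values occupy the even positions and epoch
# boundaries the odd positions, e.g. a hypothetical
#   --piecewise_learning_rate_schedule='0.1;30;0.01;60;0.001'
# uses a learning rate of 0.1 until epoch 30, 0.01 until epoch 60, and 0.001
# afterwards; the epoch boundaries are converted to global steps using
# num_batches_per_epoch above.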


def get_learning_rate(params, global_step, num_examples_per_epoch, model,
                      batch_size):
  """Returns a learning rate tensor based on global_step.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
    global_step: Scalar tensor representing the global step.
    num_examples_per_epoch: The number of examples per epoch.
    model: The model.Model object to obtain the default learning rate from if
      no learning rate is specified.
    batch_size: Number of examples per step

  Returns:
    A scalar float tensor, representing the learning rate. When evaluated, the
    learning rate depends on the current value of global_step.

  Raises:
    ValueError: Invalid or unsupported params.
  """
  with tf.name_scope('learning_rate'):
    num_batches_per_epoch = num_examples_per_epoch / batch_size

    if params.piecewise_learning_rate_schedule:
      if (params.init_learning_rate is not None or
          params.learning_rate_decay_factor or
          params.minimum_learning_rate or params.num_epochs_per_decay):
        raise ValueError('No other learning rate-related flags can be '
                         'specified if --piecewise_learning_rate_schedule is '
                         'specified')
      learning_rate = get_piecewise_learning_rate(
          params.piecewise_learning_rate_schedule,
          global_step, num_batches_per_epoch)
    elif params.init_learning_rate is not None:
      learning_rate = params.init_learning_rate
      if (params.num_epochs_per_decay > 0 and
          params.learning_rate_decay_factor > 0):
        decay_steps = int(num_batches_per_epoch * params.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        learning_rate = tf.train.exponential_decay(
            params.init_learning_rate,
            global_step,
            decay_steps,
            params.learning_rate_decay_factor,
            staircase=True)

        if params.minimum_learning_rate != 0.:
          learning_rate = tf.maximum(learning_rate,
                                     params.minimum_learning_rate)
    else:
      learning_rate = model.get_learning_rate(global_step, batch_size)
    if params.num_learning_rate_warmup_epochs > 0 and (
        params.init_learning_rate is not None or
        params.piecewise_learning_rate_schedule):
      warmup_steps = int(num_batches_per_epoch *
                         params.num_learning_rate_warmup_epochs)
      init_lr = params.init_learning_rate
      if init_lr is None:
        init_lr = float(params.piecewise_learning_rate_schedule.split(';')[0])
      warmup_lr = init_lr * tf.cast(global_step, tf.float32) / tf.cast(
          warmup_steps, tf.float32)
      learning_rate = tf.cond(global_step < warmup_steps,
                              lambda: warmup_lr, lambda: learning_rate)

  learning_rate = mlperf.logger.log_deferred_tensor_value(
      mlperf.tags.OPT_LR, learning_rate, global_step, every_n=100)
  return learning_rate
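
# Sketch of the linear warmup above (illustrative numbers): with
# init_learning_rate=0.1, num_learning_rate_warmup_epochs=5 and
# num_batches_per_epoch=1000, warmup_steps is 5000, and for global_step below
# 5000 the effective rate is 0.1 * global_step / 5000, after which the regular
# (exponentially decayed or piecewise) schedule takes over.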


def get_optimizer(params, learning_rate):
  """Returns the optimizer that should be used based on params."""
  if params.optimizer == 'momentum':
    mlperf.logger.log(key=mlperf.tags.OPT_NAME,
                      value=mlperf.tags.SGD_WITH_MOMENTUM)
    mlperf.logger.log(key=mlperf.tags.OPT_MOMENTUM, value=params.momentum)
    opt = tf.train.MomentumOptimizer(
        learning_rate, params.momentum, use_nesterov=True)
  elif params.optimizer == 'sgd':
    mlperf.logger.log(key=mlperf.tags.OPT_NAME, value=mlperf.tags.SGD)
    opt = tf.train.GradientDescentOptimizer(learning_rate)
  elif params.optimizer == 'rmsprop':
    opt = tf.train.RMSPropOptimizer(
        learning_rate,
        params.rmsprop_decay,
        momentum=params.rmsprop_momentum,
        epsilon=params.rmsprop_epsilon)
  elif params.optimizer == 'adam':
    opt = tf.train.AdamOptimizer(learning_rate, params.adam_beta1,
                                 params.adam_beta2, params.adam_epsilon)
  else:
    raise ValueError('Optimizer "{}" was not recognized'.
                     format(params.optimizer))
  return opt


def generate_tfprof_profile(profiler, tfprof_file):
  """Generates a tfprof profile, writing it to a file and printing top ops.

  Args:
    profiler: A tf.profiler.Profiler. `profiler.add_step` must have already
      been called.
    tfprof_file: The filename to write the ProfileProto to.
  """
  profile_proto = profiler.serialize_to_string()
  log_fn('Dumping ProfileProto to %s' % tfprof_file)
  with gfile.Open(tfprof_file, 'wb') as f:
    f.write(profile_proto)

  # Print out the execution times of the top operations. Note this
  # information can also be obtained with the dumped ProfileProto, but
  # printing it means tfprof doesn't have to be used if all the user wants
  # is the top ops.
  options = tf.profiler.ProfileOptionBuilder.time_and_memory()
  options['max_depth'] = _NUM_OPS_TO_PRINT
  options['order_by'] = 'accelerator_micros'
  profiler.profile_operations(options)
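
# Illustrative usage sketch (not part of the original file), assuming a
# profiler that has already recorded steps via profiler.add_step() as done in
# benchmark_one_step() above; the output path is hypothetical.
#
#   profiler = tf.profiler.Profiler(sess.graph)
#   # ... run steps, passing run_metadata to profiler.add_step(step, ...)
#   generate_tfprof_profile(profiler, '/tmp/tfprof.pb')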


class BenchmarkCNN(object):
  """Class for benchmarking a cnn network."""

  def __init__(self, params, dataset=None, model=None):
    """Initialize BenchmarkCNN.

    Args:
      params: Params tuple, typically created by make_params or
        make_params_from_flags.
      dataset: If not None, the dataset to use. Otherwise, params is used to
        obtain the dataset.
      model: If not None, the model to use. Otherwise, params is used to
        obtain the model.
    Raises:
      ValueError: Unsupported params settings.
    """
    mlperf.logger.log(key=mlperf.tags.RUN_START)
    self.params = params
    if params.eval:
      self._doing_eval = True
    else:
      # Note self._doing_eval can later switch to True in self._do_eval() if
      # self.params.eval_during_training_* is specified.
      self._doing_eval = False
    self.dataset = dataset or datasets.create_dataset(self.params.data_dir,
                                                      self.params.data_name)
    self.model = model or model_config.get_model_config(
        self.params.model, self.dataset, self.params)
    self.trace_filename = self.params.trace_file
    self.rewriter_config = self.params.rewriter_config
    autotune_threshold = self.params.autotune_threshold if (
        self.params.autotune_threshold) else 1
    min_autotune_warmup = 5 * autotune_threshold * autotune_threshold
    self.num_warmup_batches = self.params.num_warmup_batches if (
        self.params.num_warmup_batches is not None) else max(
            10, min_autotune_warmup)
    self.graph_file = self.params.graph_file
    self.resize_method = self.params.resize_method
    self.sync_queue_counter = 0
    self.num_gpus = self.params.num_gpus
    if self.params.gpu_indices:
      self.gpu_indices = [int(x) for x in self.params.gpu_indices.split(',')]
    else:
      self.gpu_indices = [x for x in range(self.num_gpus)]

    if (self.params.device == 'cpu' and self.params.data_format == 'NCHW' and
        not self.params.mkl):
      raise ValueError('device=cpu requires that data_format=NHWC')

    if ((self.params.num_epochs_per_decay or
         self.params.learning_rate_decay_factor) and
        not (self.params.init_learning_rate is not None and
             self.params.num_epochs_per_decay and
             self.params.learning_rate_decay_factor)):
      raise ValueError('If one of num_epochs_per_decay or '
                       'learning_rate_decay_factor is set, both must be set '
                       'and learning_rate must be set')
    if (self.params.minimum_learning_rate and
        not (self.params.init_learning_rate is not None and
             self.params.num_epochs_per_decay and
             self.params.learning_rate_decay_factor)):
      raise ValueError('minimum_learning_rate requires learning_rate, '
                       'num_epochs_per_decay, and '
                       'learning_rate_decay_factor to be set')

    if (self.params.use_fp16 and self.params.fp16_vars and
        'replicated' in self.params.variable_update and
        self.params.all_reduce_spec and
        'nccl' in self.params.all_reduce_spec):
      raise ValueError('fp16 variables are not supported with NCCL')
    if (self.params.use_fp16 and self.params.fp16_vars and
        self.params.gradient_repacking):
      raise ValueError('--fp16_vars cannot be used with --gradient_repacking')

    if self.params.variable_update == 'horovod' and self.params.num_gpus > 1:
      raise ValueError('Horovod benchmarks require num_gpus=1 on each worker')

    if self.params.variable_update == 'horovod' and self.params.job_name:
      raise ValueError('job_name should not be specified for Horovod.')

    if self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale:
      if self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec:
        raise ValueError('Automatic loss scaling is not supported with NCCL.')
      if self.params.variable_update not in ('parameter_server', 'replicated',
                                             'independent'):
        raise ValueError('Automatic loss scaling is not supported with '
                         'variable_update=%s.' % self.params.variable_update)
      if self.params.staged_vars:
        raise ValueError('Automatic loss scaling is not supported with '
                         'staged_vars.')

    if (self.params.debugger is not None and self.params.debugger != 'cli' and
        ':' not in self.params.debugger):
      raise ValueError('--debugger must be "cli" or in the form '
                       'host:port')

    if self.params.hierarchical_copy and self.params.num_gpus <= 1:
      raise ValueError('--hierarchical_copy requires --num_gpus to be greater '
                       'than 1')

    if params.save_model_secs and params.save_model_steps:
      raise ValueError('At most one of --save_model_secs and '
                       '--save_model_steps can be specified')

    eval_during_training_flags = list(map(bool, [
        params.eval_during_training_every_n_steps,
        params.eval_during_training_every_n_epochs,
        params.eval_during_training_at_specified_steps,
        params.eval_during_training_at_specified_epochs,
    ]))

    if eval_during_training_flags.count(True) > 1:
      raise ValueError('At most one flag with --eval_during_training_* prefix '
                       'must be specified.')

    eval_during_training_enabled = any(eval_during_training_flags)

    if eval_during_training_enabled:
      if params.eval:
        raise ValueError('At most one of --eval and --eval_during_training_* '
                         'must be specified')
      if params.forward_only:
        raise ValueError('At most one of --forward_only and '
                         '--eval_during_training_* must be specified')
      if params.job_name:
        raise ValueError('--eval_during_training_* is not yet supported in '
                         'distributed mode.')
      if params.staged_vars:
        raise ValueError('--eval_during_training_* is not currently '
                         'compatible with --staged_vars')

    if params.stop_at_top_1_accuracy and not eval_during_training_enabled:
      raise ValueError('--stop_at_top_1_accuracy is only supported with '
                       '--eval_during_training_*')
    if params.collect_eval_results_async and params.model != 'ssd300':
      raise ValueError('--collect_eval_results_async only works with ssd300 '
                       'model currently.')

    if self.params.forward_only and self.params.freeze_when_forward_only:
      if self.params.train_dir is not None:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True, --train_dir '
                         'should not be specified')
      if self.params.data_dir and not self.params.datasets_use_prefetch:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True and --data_dir '
                         'is set, --datasets_use_prefetch should be set to '
                         'True')
      if self.params.job_name:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True, --job_name '
                         'should not be specified and distributed running is '
                         'not supported')
      self.forward_only_and_freeze = True
    else:
      self.forward_only_and_freeze = False
      if self.params.trt_mode:
        raise ValueError('--trt_mode should not be specified if one of '
                         '--forward_only and --freeze_when_forward_only is '
                         'set to False')

    self.mode = get_mode_from_params(self.params)

    # Use the batch size from the command line if specified, otherwise use the
    # model's default batch size. Scale the benchmark's batch size by the
    # number of GPUs.
    if self.params.batch_size > 0:
      self.model.set_batch_size(self.params.batch_size)
    self.batch_size = self.model.get_batch_size() * self.num_gpus
    if self.mode in (constants.BenchmarkMode.TRAIN,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      self.train_batch_size = self.batch_size
    else:
      self.train_batch_size = None
    if self.mode in (constants.BenchmarkMode.EVAL,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      if self.params.eval_batch_size > 0:
        self.eval_batch_size = self.params.eval_batch_size * self.num_gpus
      else:
        self.eval_batch_size = self.batch_size
    else:
      self.eval_batch_size = None
    self.batch_group_size = self.params.batch_group_size
    self.enable_auto_loss_scale = (
        self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale)
    self.loss_scale = None
    self.loss_scale_normal_steps = None

    self.job_name = self.params.job_name  # "" for local training

    # PS server is used for distributed jobs not using all-reduce.
    use_ps_server = self.job_name and (
        self.params.variable_update != 'distributed_all_reduce' and
        self.params.variable_update != 'collective_all_reduce')
    # controller is used for distributed_all_reduce with > 1 worker.
    use_controller = (
        self.params.variable_update == 'distributed_all_reduce' and
        self.job_name)
    if use_controller and not params.controller_host:
      raise ValueError('When variable_update==distributed_all_reduce '
                       'controller_host must also be specified.')
    # collective_all_reduce doesn't need a controller or ps
    self.distributed_collective = (
        self.params.variable_update == 'collective_all_reduce' and
        self.job_name)

    self.local_parameter_device_flag = self.params.local_parameter_device
    if self.job_name:
      self.task_index = self.params.task_index
      self.cluster_manager = platforms_util.get_cluster_manager(
          params, create_config_proto(params))
      assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager)

      worker_prefix = '/job:worker/replica:0/task:%s' % self.task_index
      if use_ps_server:
        self.param_server_device = tf.train.replica_device_setter(
            worker_device=worker_prefix + '/cpu:0',
            cluster=self.cluster_manager.get_cluster_spec())
        # The device on which the queues for managing synchronization between
        # servers should be stored.
        self.sync_queue_devices = [
            '/job:ps/replica:0/task:%s/cpu:0' % i
            for i in range(self.cluster_manager.num_ps())
        ]
      else:
        self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0']
    else:
      self.task_index = 0
      self.cluster_manager = None
      worker_prefix = ''
      self.param_server_device = '/%s:0' % self.params.local_parameter_device
      self.sync_queue_devices = [self.param_server_device]

    if self.cluster_manager:
      self.num_workers = self.cluster_manager.num_workers()
    elif self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      self.num_workers = hvd.size()
    else:
      self.num_workers = 1
    self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0

    if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl':
      raise ValueError('--all_reduce_spec=nccl is invalid in a '
                       'multi-worker job')

    # Device to use for ops that need to always run on the local worker's CPU.
    self.cpu_device = '%s/cpu:0' % worker_prefix

    # Device to use for ops that need to always run on the local worker's
    # compute device, and never on a parameter server device.
    self.raw_devices = [
        '%s/%s:%i' % (worker_prefix, self.params.device, i)
        for i in xrange(self.num_gpus)
    ]

    subset = 'validation' if params.eval else 'train'
    self.num_batches, self.num_epochs = get_num_batches_and_epochs(
        params, self.batch_size * self.num_workers,
        self.dataset.num_examples_per_epoch(subset))
    if self.mode in (constants.BenchmarkMode.EVAL,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      # TODO(reedwm): Currently we do extra eval logic for num_eval_batches
      # and the preprocessor. We should encapsulate this logic into a shared
      # function or class.
      if params.num_eval_batches is None and params.num_eval_epochs is None:
        eval_params = self.params
      else:
        eval_params = self.params._replace(
            num_batches=self.params.num_eval_batches,
            num_epochs=self.params.num_eval_epochs)
      self.num_eval_batches, self.num_eval_epochs = (
          get_num_batches_and_epochs(
              eval_params, self.eval_batch_size * self.num_workers,
              self.dataset.num_examples_per_epoch('validation')))
    else:
      self.num_eval_batches, self.num_eval_epochs = None, None

    num_train_examples_per_epoch = self.dataset.num_examples_per_epoch('train')
    if self.params.eval_during_training_every_n_epochs:
      n_epochs = self.params.eval_during_training_every_n_epochs
      self.eval_during_training_at_specified_steps = {
          (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
           self.batch_size)
          for e in np.arange(n_epochs, self.num_epochs, n_epochs)}
    if self.params.eval_during_training_at_specified_steps:
      try:
        self.eval_during_training_at_specified_steps = set(
            map(int, self.params.eval_during_training_at_specified_steps))
      except ValueError:
        raise ValueError('Param eval_during_training_at_specified_steps value '
                         'of %s cannot be converted to a list of integers.' %
                         (self.params.eval_during_training_at_specified_steps))
    if self.params.eval_during_training_at_specified_epochs:
      try:
        n_epochs = list(
            map(float, self.params.eval_during_training_at_specified_epochs))
        offset = n_epochs[0] - 1
        if offset.is_integer():
          offset = int(offset)
        mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
        self.eval_during_training_at_specified_steps = {
            (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
             self.batch_size)
            for e in n_epochs}
      except ValueError:
        raise ValueError('Param eval_during_training_at_specified_epochs '
                         'value of %s cannot be converted to a list of '
                         'floats.' %
                         (self.params.eval_during_training_at_specified_epochs))
    if params.eval_during_training_every_n_epochs:
      offset = params.eval_during_training_every_n_epochs - 1
      if offset.is_integer():
        offset = int(offset)
      mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)

    if (self.params.staged_vars and
        self.params.variable_update != 'parameter_server'):
      raise ValueError('staged_vars for now is only supported with '
                       'variable_update=parameter_server')

    if self.params.variable_update == 'parameter_server':
      if self.job_name:
        if not self.params.staged_vars:
          self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS(
              self)
        else:
          self.variable_mgr = (
              variable_mgr.VariableMgrDistributedFetchFromStagedPS(self))
      else:
        if not self.params.staged_vars:
          self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
        else:
          self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS(
              self)
    elif self.params.variable_update == 'replicated':
      if self.job_name:
        raise ValueError('Invalid variable_update in distributed mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
          self, self.params.all_reduce_spec,
          self.params.agg_small_grads_max_bytes,
          self.params.agg_small_grads_max_group,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'distributed_all_reduce':
      assert self.params.cross_replica_sync
      self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce(
          self, self.params.all_reduce_spec,
          ('worker' if self.num_workers > 1 else 'localhost'),
          self.num_workers, self.params.agg_small_grads_max_bytes,
          self.params.agg_small_grads_max_group,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'collective_all_reduce':
      assert self.params.cross_replica_sync
      self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce(
          self, self.params.all_reduce_spec,
          self.num_workers, self.num_gpus, self.task_index,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'distributed_replicated':
      assert self.params.cross_replica_sync
      if not self.job_name:
        raise ValueError('Invalid variable_update in local mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self)
    elif self.params.variable_update in ('independent', 'horovod'):
      if self.job_name:
        raise ValueError('Invalid variable_update in distributed mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
    else:
      raise ValueError('Invalid variable_update: %s' %
                       self.params.variable_update)

    # Device to use for running on the local worker's compute device, but
    # with variables assigned to parameter server devices.
    self.devices = self.variable_mgr.get_devices()
    if self.job_name:
      if use_ps_server:
        self.global_step_device = self.param_server_device
      elif self.params.variable_update == 'collective_all_reduce':
        self.global_step_device = self.cpu_device
      else:
        self.global_step_device = '/job:worker/replica:0/task:0/cpu:0'
    else:
      self.global_step_device = self.cpu_device

    self.input_preprocessor = None
    self.eval_input_preprocessor = None
    if not self.dataset.use_synthetic_gpu_inputs():
      if not self.params.eval:
        self.input_preprocessor = self.get_input_preprocessor()
      if self.mode in (constants.BenchmarkMode.EVAL,
                       constants.BenchmarkMode.TRAIN_AND_EVAL):
        with self._do_eval():
          self.eval_input_preprocessor = self.get_input_preprocessor()
    self.datasets_use_prefetch = (
        self.params.datasets_use_prefetch and
        # TODO(rohanj): Figure out why --datasets_use_prefetch freezes on the
        # CPU.
        self.params.device.lower() != 'cpu' and
        self.input_preprocessor and
        self.input_preprocessor.supports_datasets())
    self.init_global_step = 0
    self._config_benchmark_logger()

    if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
      # Remove "eval" from params so it is not accidentally used. Since eval
      # can still occur despite params.eval being False, params.eval should
      # never be used. We cannot yet remove this unconditionally, because the
      # SSD model still uses params.eval, and hence does not work properly
      # with --eval_during_training_*.
      # TODO(b/116627045): We should also remove fields that have an eval
      # equivalent, like num_batches and num_eval_batches.
      self.params = remove_param_fields(self.params, {'eval'})

  @contextlib.contextmanager
  def _do_eval(self):
    """Context manager to switch BenchmarkCNN to eval mode.

    Any evaluation code should be put under this context manager. This context
    manager switches self._doing_eval to True. It also switches certain
    attributes, like self.num_batches and self.num_epochs, to be the number of
    batches and epochs for evaluation respectively.

    Yields:
      Nothing.
    """
    # TODO(b/116627045): Find a more general way of switching attributes to
    # the eval equivalents.
    old_doing_eval = self._doing_eval
    old_num_batches = self.num_batches
    old_num_epochs = self.num_epochs
    old_batch_size = self.batch_size
    try:
      self._doing_eval = True
      self.num_batches = self.num_eval_batches
      self.num_epochs = self.num_eval_epochs
      self.batch_size = self.eval_batch_size
      self.model.set_batch_size(self.eval_batch_size // self.num_gpus)
      yield
    finally:
      self._doing_eval = old_doing_eval
      self.num_batches = old_num_batches
      self.num_epochs = old_num_epochs
      self.batch_size = old_batch_size
      self.model.set_batch_size(old_batch_size // self.num_gpus)

  def _config_benchmark_logger(self):
    """Config the model garden benchmark logger."""
    model_benchmark_logger = None
    if self.params.benchmark_log_dir is not None:
      try:
        from official.utils.logs import logger as models_logger  # pylint: disable=g-import-not-at-top
      except ImportError:
        tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH '
                         'in order to use BenchmarkLogger. Configured '
                         'benchmark_log_dir: %s'
                         % self.params.benchmark_log_dir)
        raise
      model_benchmark_logger = models_logger.BenchmarkFileLogger(
          self.params.benchmark_log_dir)
    self.benchmark_logger = model_benchmark_logger

  # TODO(laigd): this changes the global device list which is used everywhere,
  # consider refactoring it.
  def reset_devices_for_task(self, task_num, is_local=False):
    """Used to imitate another task when building a distributed graph."""
    worker_prefix = ('/job:localhost' if is_local else
                     '/job:worker/replica:0/task:%s' % task_num)
    self.cpu_device = '%s/cpu:0' % worker_prefix
    self.raw_devices = [
        '%s/%s:%i' % (worker_prefix, self.params.device, i)
        for i in xrange(self.num_gpus)
    ]
    self.devices = self.variable_mgr.get_devices()

  def raw_devices_across_tasks(self, is_local=False):
    """Returns list of raw device names across all tasks."""
    if is_local:
      assert self.num_workers == 1
      return self.raw_devices
    else:
      return [
          'job:worker/replica:0/task%s/%s:%i' % (t, self.params.device, i)
          for t in xrange(self.num_workers)
          for i in xrange(self.num_gpus)
      ]

  def print_info(self):
    """Print basic information."""
    benchmark_info = self._get_params_info()
    log_fn('Model: %s' % self.model.get_model_name())
    log_fn('Dataset: %s' % benchmark_info['dataset_name'])
    log_fn('Mode: %s' % self.mode)
    log_fn('SingleSess: %s' % benchmark_info['single_session'])
    log_fn('Batch size: %s global' % (self.batch_size * self.num_workers))
    log_fn(' %s per device' % (self.batch_size // len(self.raw_devices)))
    if self.batch_group_size > 1:
      log_fn(' %d batches per preprocessing group' % self.batch_group_size)
    log_fn('Num batches: %d' % self.num_batches)
    log_fn('Num epochs: %.2f' % self.num_epochs)
    log_fn('Devices: %s' % benchmark_info['device_list'])
    log_fn('NUMA bind: %s' % self.params.use_numa_affinity)
    log_fn('Data format: %s' % self.params.data_format)
    if self.rewriter_config:
      log_fn('RewriterConfig: %s' % self.rewriter_config)
    log_fn('Optimizer: %s' % self.params.optimizer)
    log_fn('Variables: %s' % self.params.variable_update)
    if (self.params.variable_update == 'replicated' or
        self.params.variable_update == 'distributed_all_reduce' or
        self.params.variable_update == 'collective_all_reduce'):
      log_fn('AllReduce: %s' % self.params.all_reduce_spec)
    if self.job_name:
      log_fn('Sync: %s' % self.params.cross_replica_sync)
    if self.params.staged_vars:
      log_fn('Staged vars: %s' % self.params.staged_vars)
    if self.params.variable_update == 'horovod' and self.params.horovod_device:
      log_fn('Horovod on: %s' % self.params.horovod_device)
    log_fn('==========')

  def _get_params_info(self):
    """Get the common parameters info for the benchmark run.

    Returns:
      A dict of processed parameters.
    """
    dataset_name = self.dataset.name
    if self.dataset.use_synthetic_gpu_inputs():
      dataset_name += ' (synthetic)'
    single_session = self.params.variable_update == 'distributed_all_reduce'
    if single_session:
      device_list = self.raw_devices_across_tasks()
    elif self.params.variable_update == 'horovod':
      device_list = ['horovod/%s:%d' % (self.params.device, idx)
                     for idx in range(self.num_workers)]
    else:
      device_list = self.raw_devices
    return {
        'dataset_name': dataset_name,
        'single_session': single_session,
        'device_list': device_list,
    }

  def _log_benchmark_run(self):
    """Log the benchmark info to the logger.

    The info logged here should be similar to print_info(), but in a
    structured JSON format.
    """
    if self.benchmark_logger:
      benchmark_info = self._get_params_info()
      run_param = {
          'model': self.model.get_model_name(),
          'dataset': benchmark_info['dataset_name'],
          'mode': self.mode,
          'single_sess': benchmark_info['single_session'],
          'devices': benchmark_info['device_list'],
          'batch_size': self.batch_size,
          'batch_size_per_device': self.batch_size // len(self.raw_devices),
          'num_batches': self.num_batches,
          'num_epochs': self.num_epochs,
          'data_format': self.params.data_format,
          'rewrite_config': self.rewriter_config,
          'optimizer': self.params.optimizer,
          'session_config': create_config_proto(self.params),
      }
      # TODO(scottzhu): tf_cnn_benchmark might execute several times with
      # different param setting on the same box. This will cause the run file
      # to only contain the latest info. The benchmark_log_dir should be
      # updated for every new run.
      self.benchmark_logger.log_run_info(
          self.model.get_model_name(), benchmark_info['dataset_name'],
          run_param, test_id=self.params.benchmark_test_id)

  def run(self):
    """Run the benchmark task assigned to this process.

    Returns:
      Dictionary of statistics for training or eval.
    Raises:
      ValueError: unrecognized job name.
    """
    if self.params.job_name == 'ps':
      log_fn('Running parameter server %s' % self.task_index)
      self.cluster_manager.join_server()
      return {}

    # For distributed_all_reduce with multiple workers, drive
    # from a separate controller process.
    if self.params.variable_update == 'distributed_all_reduce':
      if self.params.job_name == 'worker':
        log_fn('Starting worker %s' % self.task_index)
        self.cluster_manager.join_server()
        return
      elif self.params.job_name and self.params.job_name != 'controller':
        raise ValueError('unrecognized job name: %s' % self.params.job_name)

    self._log_benchmark_run()
    if self._doing_eval:
      with tf.Graph().as_default():
        # TODO(laigd): freeze the graph in eval mode.
        return self._run_eval()
    else:
      return self._benchmark_train()

  def _run_eval(self):
    """Evaluate a model every self.params.eval_interval_secs.

    Returns:
      Dictionary containing eval statistics. Currently returns an empty
      dictionary.

    Raises:
      ValueError: If self.params.train_dir is unspecified.
    """
    if self.params.train_dir is None:
      raise ValueError('Trained model directory not specified')
    graph_info = self._build_eval_graph()
    saver = tf.train.Saver(self.variable_mgr.savable_variables())
    summary_writer = tf.summary.FileWriter(self.params.eval_dir,
                                           tf.get_default_graph())
    target = ''
    # TODO(huangyp): Check if checkpoints haven't updated for hours and abort.
    while True:
      with tf.Session(
          target=target, config=create_config_proto(self.params)) as sess:
        image_producer = None
        try:
          global_step = load_checkpoint(saver, sess, self.params.train_dir)
          image_producer = self._initialize_eval_graph(
              graph_info.enqueue_ops, graph_info.input_producer_op,
              graph_info.local_var_init_op_group, sess)
        except CheckpointNotFoundException:
          log_fn('Checkpoint not found in %s' % self.params.train_dir)
        else:
          # Only executes if an exception was not thrown
          self._eval_once(sess, summary_writer, graph_info.fetches,
                          graph_info.summary_op, image_producer, global_step)
        if image_producer is not None:
          image_producer.done()
      if self.params.eval_interval_secs <= 0:
        break
      time.sleep(self.params.eval_interval_secs)
    return {}

  def _build_eval_graph(self, scope_name=None):
    """Build the evaluation graph.

    Args:
      scope_name: String to filter what summaries are collected. Only summary
        ops whose name contains `scope_name` will be added, which is useful
        for only including evaluation ops.

    Returns:
      A GraphInfo named_tuple containing various useful ops and tensors of the
      evaluation graph.
    """
    with self._do_eval():
      input_producer_op, enqueue_ops, fetches = self._build_model()
      local_var_init_op = tf.local_variables_initializer()
      table_init_ops = tf.tables_initializer()
      variable_mgr_init_ops = [local_var_init_op]
      if table_init_ops:
        variable_mgr_init_ops.extend([table_init_ops])
      with tf.control_dependencies([local_var_init_op]):
        variable_mgr_init_ops.extend(self.variable_mgr.get_post_init_ops())
      local_var_init_op_group = tf.group(*variable_mgr_init_ops)

      summary_op = tf.summary.merge_all(scope=scope_name)
      # The eval graph has no execution barrier because it doesn't run in
      # distributed mode.
      execution_barrier = None
      # We do not use the global step during evaluation.
      global_step = None
      return GraphInfo(input_producer_op, enqueue_ops, fetches,
                       execution_barrier, global_step,
                       local_var_init_op_group, summary_op)

  # TODO(reedwm): For consistency, we should have a similar
  # "_initialize_train_graph" function. They can likely be the same function.
  def _initialize_eval_graph(self, enqueue_ops, input_producer_op,
                             local_var_init_op_group, sess):
    """Initializes the evaluation graph.

    Args:
      enqueue_ops: Ops that add the preprocessed images to the staging areas.
      input_producer_op: Op that produces the input batches (before
        preprocessing).
      local_var_init_op_group: Group of ops that perform per-device
        initialization work.
      sess: The session to initialize the eval graph with.

    Returns:
      An ImageProducer, or None if an ImageProducer isn't being used.
    """
    with self._do_eval():
      if local_var_init_op_group is not None:
        # We might reinitialize local variables if they were already
        # initialized during training. This is OK.
        sess.run(local_var_init_op_group)
      if self.dataset.queue_runner_required():
        tf.train.start_queue_runners(sess=sess)
      image_producer = None
      if input_producer_op is not None:
        image_producer = cnn_util.ImageProducer(
            sess, input_producer_op, self.batch_group_size,
            self.params.use_python32_barrier)
        image_producer.start()
      if enqueue_ops:
        for i in xrange(len(enqueue_ops)):
          sess.run(enqueue_ops[:(i + 1)])
        if image_producer is not None:
          image_producer.notify_image_consumption()
      return image_producer

  def _eval_once(self, sess, summary_writer, fetches, summary_op,
                 image_producer, global_step):
    """Evaluate the model using the validation dataset."""
    with self._do_eval():
      mlperf.logger.log_eval_epoch(mlperf.tags.EVAL_START, global_step,
                                   self.batch_size)
      loop_start_time = start_time = time.time()
      # TODO(laigd): refactor the part to compute/report the accuracy.
      # Currently it only works for image models.
      top_1_accuracy_sum = 0.0
      top_5_accuracy_sum = 0.0
      total_eval_count = self.num_batches * self.batch_size
      for step in xrange(self.num_batches):
        if (summary_writer and self.params.save_summaries_steps > 0 and
            (step + 1) % self.params.save_summaries_steps == 0):
          results, summary_str = sess.run([fetches, summary_op])
          summary_writer.add_summary(summary_str)
        else:
          results = sess.run(fetches)
        # Make global_step available in results for postprocessing.
        results['global_step'] = global_step
        results = self.model.postprocess(results)
        top_1_accuracy_sum += results['top_1_accuracy']
        top_5_accuracy_sum += results['top_5_accuracy']
        if (step + 1) % self.params.display_every == 0:
          duration = time.time() - start_time
          examples_per_sec = (
              self.batch_size * self.params.display_every / duration)
          log_fn('%i\t%.1f examples/sec' % (step + 1, examples_per_sec))
          start_time = time.time()
        if image_producer is not None:
          image_producer.notify_image_consumption()
      loop_end_time = time.time()
      accuracy_at_1 = top_1_accuracy_sum / self.num_batches
      accuracy_at_5 = top_5_accuracy_sum / self.num_batches
      summary = tf.Summary()
      summary.value.add(tag='eval/Accuracy@1', simple_value=accuracy_at_1)
      summary.value.add(tag='eval/Accuracy@5', simple_value=accuracy_at_5)
      for result_key, result_value in results.items():
        if result_key.startswith(constants.SIMPLE_VALUE_RESULT_PREFIX):
          prefix_len = len(constants.SIMPLE_VALUE_RESULT_PREFIX)
          summary.value.add(tag='eval/' + result_key[prefix_len:],
                            simple_value=result_value)
      if summary_writer:
        summary_writer.add_summary(summary, global_step)
      log_fn('Accuracy @ 1 = %.4f Accuracy @ 5 = %.4f [%d examples]' %
             (accuracy_at_1, accuracy_at_5, total_eval_count))
      elapsed_time = loop_end_time - loop_start_time
      images_per_sec = (self.num_batches * self.batch_size / elapsed_time)
      if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
        # Note that we compute the top 1 accuracy and top 5 accuracy for each
        # batch, which will have a slight performance impact.
        log_fn('-' * 64)
        log_fn('total images/sec: %.2f' % images_per_sec)
        log_fn('-' * 64)
      if self.benchmark_logger:
        eval_result = {
            'eval_top_1_accuracy': accuracy_at_1,
            'eval_top_5_accuracy': accuracy_at_5,
            'eval_average_examples_per_sec': images_per_sec,
            tf.GraphKeys.GLOBAL_STEP: global_step,
        }
        self.benchmark_logger.log_evaluation_result(eval_result)
      mlperf.logger.log_eval_epoch(mlperf.tags.EVAL_STOP, global_step,
                                   self.batch_size)
      mlperf.logger.log(key=mlperf.tags.EVAL_SIZE,
                        value=self.num_batches * self.batch_size)
      if self.params.model != 'ssd300':
        # ssd300 logs eval accuracy elsewhere.
        mlperf.logger.log_eval_accuracy(
            accuracy_at_1, global_step, self.train_batch_size,
            examples_per_epoch=self.dataset.num_examples_per_epoch('train'))
      if self.params.stop_at_top_1_accuracy:
        mlperf.logger.log(key=mlperf.tags.EVAL_TARGET,
                          value=self.params.stop_at_top_1_accuracy)
      return accuracy_at_1, accuracy_at_5

  def _benchmark_train(self):
    """Run cnn in benchmark mode. Skip the backward pass if forward_only is on.

    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    graph = tf.Graph()
    with graph.as_default():
      build_result = self._build_graph()
      if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
        with self.variable_mgr.reuse_variables():
          with tf.name_scope('Evaluation') as ns:
            eval_build_results = self._build_eval_graph(ns)
      else:
        eval_build_results = None
    (graph, result_to_benchmark) = self._preprocess_graph(graph, build_result)
    with graph.as_default():
      return self._benchmark_graph(result_to_benchmark, eval_build_results)

  GPU_CACHED_INPUT_VARIABLE_NAME = 'gpu_cached_inputs'

  def _unfreezable_local_variables(self, graph):
    """Get the local variables that we don't want to freeze."""
    return graph.get_collection(
        tf.GraphKeys.LOCAL_VARIABLES,
        # We don't freeze the gpu_cached_images local variable so it won't get
        # constant folded with ops which process the input.
        scope='.*' + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME)

  def _build_graph(self):
    """Build the graph.

    Returns:
      A namedtuple containing the ops/tensors that are required by
      _benchmark_graph().
    """
    if self.params.variable_update == 'distributed_all_reduce':
      self.single_session = True
      (input_producer_op, enqueue_ops, fetches) = (
          self._build_model_single_session())
    else:
      self.single_session = False
      (input_producer_op, enqueue_ops, fetches) = self._build_model()
    fetches_list = nest.flatten(list(fetches.values()))
    main_fetch_group = tf.group(*fetches_list, name='main_fetch_group')
    execution_barrier = None
    if (not self.single_session and self.job_name and
        not self.params.cross_replica_sync):
      execution_barrier = self.add_sync_queues_and_barrier(
          'execution_barrier_', [])

    global_step = tf.train.get_global_step()
    with tf.device(self.global_step_device), tf.name_scope('inc_global_step'):
      with tf.control_dependencies([main_fetch_group]):
        fetches['inc_global_step'] = global_step.assign_add(1)

    if ((not self.single_session) and (not self.distributed_collective) and
        self.job_name and self.params.cross_replica_sync):
      # Block all replicas until all replicas are ready for next step.
      fetches['sync_queues'] = self.add_sync_queues_and_barrier(
          'sync_queues_step_end_', [main_fetch_group])

    # Skips the init ops for freezable local variables in forward_only mode so
    # we can remove all the assign ops when converting variables to constants.
    with tf.name_scope('local_variable_initialization'):
      if self.forward_only_and_freeze:
        local_var_init_op = tf.variables_initializer(
            self._unfreezable_local_variables(tf.get_default_graph()))
      else:
        local_var_init_op = tf.local_variables_initializer()
      table_init_ops = tf.tables_initializer()

    variable_manager_init_ops = [local_var_init_op]
    if table_init_ops:
      variable_manager_init_ops.extend([table_init_ops])
    if not self.forward_only_and_freeze:
      with tf.control_dependencies([local_var_init_op]):
        variable_manager_init_ops.extend(
            self.variable_mgr.get_post_init_ops())
    if ((not self.single_session) and (not self.distributed_collective) and
        self.job_name and self.params.cross_replica_sync):
      # Ensure all workers execute variable_manager_init_ops before they start
      # executing the model.
      variable_manager_init_ops.append(
          self.add_sync_queues_and_barrier('init_ops_end_',
                                           variable_manager_init_ops))
    local_var_init_op_group = tf.group(*variable_manager_init_ops,
                                       name='local_var_init_op_group')
    summary_op = tf.summary.merge_all()

    return GraphInfo(
        input_producer_op=input_producer_op,
        enqueue_ops=enqueue_ops,
        fetches=fetches,
        execution_barrier=execution_barrier,
        global_step=global_step,
        local_var_init_op_group=local_var_init_op_group,
        summary_op=summary_op)
  def _benchmark_graph(self, graph_info, eval_graph_info):
    """Benchmark the training graph.

    Args:
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
      eval_graph_info: Similar to graph_info but for the eval graph if
        --eval_during_training_* is used. Otherwise, None.
    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    log_fn('Initializing graph')
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      # First worker will be 'chief' - it will write summaries and
      # save checkpoints.
      is_chief = hvd.rank() == 0
    else:
      is_chief = (not self.job_name or self.task_index == 0)

    summary_writer = None
    if (is_chief and self.params.summary_verbosity and self.params.train_dir
        and self.params.save_summaries_steps > 0):
      summary_writer = tf.summary.FileWriter(self.params.train_dir,
                                             tf.get_default_graph())

    # We want to start the benchmark timer right after an image_producer
    # barrier, to avoid undesired waiting times on barriers.
    if ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
        self.batch_group_size) != 0:
      self.num_warmup_batches = int(
          math.ceil(
              (self.num_warmup_batches + len(graph_info.enqueue_ops) - 1.0) /
              (self.batch_group_size)) * self.batch_group_size -
          len(graph_info.enqueue_ops) + 1)
      log_fn('Round up warm up steps to %d to match batch_group_size' %
             self.num_warmup_batches)
      assert ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
              self.batch_group_size) == 0

    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    if is_chief and not self.forward_only_and_freeze:
      saver = tf.train.Saver(
          self.variable_mgr.savable_variables(),
          save_relative_paths=True,
          max_to_keep=self.params.max_ckpts_to_keep)
    else:
      saver = None
    ready_for_local_init_op = None
    if self.job_name and not (self.single_session or
                              self.distributed_collective):
      # In distributed mode, we don't want to run local_var_init_op_group until
      # the global variables are initialized, because local_var_init_op_group
      # may use global variables (such as in distributed replicated mode). We
      # don't set this in non-distributed mode, because in non-distributed
      # mode, local_var_init_op_group may itself initialize global variables
      # (such as in replicated mode).
      ready_for_local_init_op = tf.report_uninitialized_variables(
          tf.global_variables())
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      bcast_global_variables_op = hvd.broadcast_global_variables(0)
    else:
      bcast_global_variables_op = None

    if self.params.variable_update == 'collective_all_reduce':
      # It doesn't matter what this collective_graph_key value is,
      # so long as it's > 0 and the same at every worker.
      init_run_options = tf.RunOptions()
      init_run_options.experimental.collective_graph_key = 6
    else:
      init_run_options = tf.RunOptions()
    local_var_init_ops = [graph_info.local_var_init_op_group]
    if eval_graph_info:
      # `eval_graph_info.local_var_init_op_group` also includes some of the
      # training initializer ops, since it's difficult to filter them out.
      # Rerunning the training initializer ops is OK, but we add a control
      # dependency since running two sets of training initializer ops at the
      # same time can cause race conditions.
      with tf.control_dependencies(local_var_init_ops):
        local_var_init_ops.append(eval_graph_info.local_var_init_op_group)
    sv = tf.train.Supervisor(
        # For the purpose of Supervisor, all Horovod workers are 'chiefs',
        # since we want session to be initialized symmetrically on all the
        # workers.
        is_chief=is_chief or (self.params.variable_update == 'horovod'
                              or self.distributed_collective),
        # Log dir should be unset on non-chief workers to prevent Horovod
        # workers from corrupting each other's checkpoints.
        logdir=self.params.train_dir if is_chief else None,
        ready_for_local_init_op=ready_for_local_init_op,
        local_init_op=local_var_init_ops,
        saver=saver,
        global_step=graph_info.global_step,
        summary_op=None,
        save_model_secs=self.params.save_model_secs,
        summary_writer=summary_writer,
        local_init_run_options=init_run_options)

    profiler = tf.profiler.Profiler() if self.params.tfprof_file else None

    if self.graph_file is not None:
      path, filename = os.path.split(self.graph_file)
      as_text = filename.endswith('txt')
      log_fn('Writing GraphDef as %s to %s' % (  # pyformat break
          'text' if as_text else 'binary', self.graph_file))
      tf.train.write_graph(
          tf.get_default_graph().as_graph_def(add_shapes=True), path, filename,
          as_text)

    start_standard_services = (
        self.params.train_dir or self.dataset.queue_runner_required())
    target = self.cluster_manager.get_target() if self.cluster_manager else ''
    with sv.managed_session(
        master=target,
        config=create_config_proto(self.params),
        start_standard_services=start_standard_services) as sess:
      # Anything that can potentially raise an OutOfRangeError with 'sess' MUST
      # be under this try block. The managed_session() context manager silently
      # ignores OutOfRangeError, so we must catch them and wrap them with
      # a different exception type so that they can be propagated up to the
      # caller.
      try:
        stats = self.benchmark_with_session(
            sess, sv, graph_info, eval_graph_info, bcast_global_variables_op,
            is_chief, summary_writer, profiler)
      except tf.errors.OutOfRangeError:
        raise RuntimeError(
            'Received OutOfRangeError. Wrapping in Runtime error to avoid '
            'Supervisor from suppressing the error. Original OutOfRangeError '
            'with traceback:\n' + traceback.format_exc())

    sv.stop()
    if profiler:
      generate_tfprof_profile(profiler, self.params.tfprof_file)
    return stats
  def benchmark_with_session(self, sess, supervisor, graph_info,
                             eval_graph_info, bcast_global_variables_op,
                             is_chief, summary_writer, profiler):
    """Benchmarks the graph with the given session.

    Args:
      sess: The session to benchmark the graph with.
      supervisor: The Supervisor that created the session.
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
      eval_graph_info: Similar to graph_info but for the eval graph if
        --eval_during_training_every_n_steps is used. Otherwise, None.
      bcast_global_variables_op: If Horovod is used, the op to broadcast the
        global variables to all the processes. None if Horovod is not used.
      is_chief: True if this is the chief process.
      summary_writer: The SummaryWriter used to write summaries, or None if
        summaries are not used.
      profiler: The tf.profiler.Profiler, or None if tfprof is not used.
    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    if self.params.backbone_model_path is not None:
      self.model.load_backbone_model(sess, self.params.backbone_model_path)
    if bcast_global_variables_op:
      sess.run(bcast_global_variables_op)
    image_producer = None
    if graph_info.input_producer_op is not None:
      image_producer = cnn_util.ImageProducer(
          sess, graph_info.input_producer_op, self.batch_group_size,
          self.params.use_python32_barrier)
      image_producer.start()
    if graph_info.enqueue_ops:
      for i in xrange(len(graph_info.enqueue_ops)):
        sess.run(graph_info.enqueue_ops[:(i + 1)])
        if image_producer is not None:
          image_producer.notify_image_consumption()
    self.init_global_step, = sess.run([graph_info.global_step])
    if self.job_name and not self.params.cross_replica_sync:
      # TODO(zhengxq): Do we need to use a global step watcher at all?
      global_step_watcher = GlobalStepWatcher(
          sess, graph_info.global_step,
          self.num_workers * self.num_warmup_batches + self.init_global_step,
          self.num_workers * (self.num_warmup_batches + self.num_batches) - 1)
      global_step_watcher.start()
    else:
      global_step_watcher = None
    eval_image_producer = None
    if eval_graph_info:
      # We pass local_var_init_op_group=None because the Supervisor already
      # initialized local variables above. We need to have the Supervisor
      # initialize the local variables, because otherwise it throws an error
      # complaining that not all variables were initialized.
      eval_image_producer = self._initialize_eval_graph(
          eval_graph_info.enqueue_ops, eval_graph_info.input_producer_op,
          local_var_init_op_group=None, sess=sess)

    step_train_times = []
    log_fn('Running warm up')
    local_step = -1 * self.num_warmup_batches
    if self.single_session:
      # In single session mode, each step, the global_step is incremented by
      # 1. In non-single session mode, each step, the global_step is
      # incremented once per worker. This means we need to divide
      # init_global_step by num_workers only in non-single session mode.
      end_local_step = self.num_batches - self.init_global_step
    else:
      end_local_step = self.num_batches - (self.init_global_step //
                                           self.num_workers)
    if not global_step_watcher:
      # In cross-replica sync mode, all workers must run the same number of
      # local steps, or else the workers running the extra step will block.
      done_fn = lambda: local_step >= end_local_step
    else:
      done_fn = global_step_watcher.done
    if self.params.debugger is not None:
      if self.params.debugger == 'cli':
        log_fn('The CLI TensorFlow debugger will be used.')
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)
      else:
        log_fn('The TensorBoard debugger plugin will be used.')
        sess = tf_debug.TensorBoardDebugWrapperSession(sess,
                                                       self.params.debugger)
    mlperf.logger.log(key=mlperf.tags.TRAIN_LOOP)
    skip_final_eval = False
    accuracy_at_1 = None
    accuracy_at_5 = None
    last_eval_step = local_step
    loop_start_time = time.time()
    last_average_loss = None
    while not done_fn():
      if local_step == 0:
        log_fn('Done warm up')
        if graph_info.execution_barrier:
          log_fn('Waiting for other replicas to finish warm up')
          sess.run([graph_info.execution_barrier])

        # TODO(laigd): rename 'Img' to maybe 'Input'.
        header_str = ('Step\tImg/sec\t' +
                      self.params.loss_type_to_report.replace('/', ' '))
        if self.params.print_training_accuracy or self.params.forward_only:
          # TODO(laigd): use the actual accuracy op names of the model.
          header_str += '\ttop_1_accuracy\ttop_5_accuracy'
        log_fn(header_str)
        assert len(step_train_times) == self.num_warmup_batches
        # reset times to ignore warm up batch
        step_train_times = []
        loop_start_time = time.time()
      if (summary_writer and
          (local_step + 1) % self.params.save_summaries_steps == 0):
        fetch_summary = graph_info.summary_op
      else:
        fetch_summary = None
      collective_graph_key = 7 if (
          self.params.variable_update == 'collective_all_reduce') else 0
      (summary_str, last_average_loss) = benchmark_one_step(
          sess, graph_info.fetches, local_step,
          self.batch_size * (self.num_workers if self.single_session else 1),
          step_train_times, self.trace_filename,
          self.params.partitioned_graph_file_prefix, profiler, image_producer,
          self.params, fetch_summary,
          benchmark_logger=self.benchmark_logger,
          collective_graph_key=collective_graph_key)
      if summary_str is not None and is_chief:
        supervisor.summary_computed(sess, summary_str)
      local_step += 1
      if (self.params.save_model_steps and
          local_step % self.params.save_model_steps == 0 and
          local_step > 0 and is_chief):
        supervisor.saver.save(sess, supervisor.save_path,
                              supervisor.global_step)
      if (eval_graph_info and local_step > 0 and not done_fn() and
          self._should_eval_during_training(local_step)):
        python_global_step = sess.run(graph_info.global_step)
        num_steps_since_last_eval = local_step - last_eval_step
        # The INPUT_SIZE tag value might not match the
        # PREPROC_NUM_TRAIN_EXAMPLES tag value, because the number of examples
        # run, which is INPUT_SIZE, is rounded up to the nearest multiple of
        # self.batch_size.
        mlperf.logger.log(key=mlperf.tags.INPUT_SIZE,
                          value=num_steps_since_last_eval * self.batch_size)
        log_fn('Running evaluation at global_step {}'.format(
            python_global_step))
        accuracy_at_1, accuracy_at_5 = self._eval_once(
            sess, summary_writer, eval_graph_info.fetches,
            eval_graph_info.summary_op, eval_image_producer,
            python_global_step)
        last_eval_step = local_step
        if (self.params.stop_at_top_1_accuracy and
            accuracy_at_1 >= self.params.stop_at_top_1_accuracy):
          log_fn('Stopping, as eval accuracy at least %s was reached' %
                 self.params.stop_at_top_1_accuracy)
          skip_final_eval = True
          break
        else:
          log_fn('Resuming training')
      if eval_graph_info and self.model.reached_target():
        log_fn('Stopping, as the model indicates its custom goal was reached')
        skip_final_eval = True
        break
    loop_end_time = time.time()
    # Waits for the global step to be done, regardless of done_fn.
    if global_step_watcher:
      while not global_step_watcher.done():
        time.sleep(.25)
    if not global_step_watcher:
      elapsed_time = loop_end_time - loop_start_time
      average_wall_time = elapsed_time / local_step if local_step > 0 else 0
      images_per_sec = (self.num_workers * local_step * self.batch_size /
                        elapsed_time)
      num_steps = local_step * self.num_workers
    else:
      # NOTE: Each worker independently increases the global step. So,
      # num_steps will be the sum of the local_steps from each worker.
      num_steps = global_step_watcher.num_steps()
      elapsed_time = global_step_watcher.elapsed_time()
      average_wall_time = (elapsed_time * self.num_workers / num_steps
                           if num_steps > 0 else 0)
      images_per_sec = num_steps * self.batch_size / elapsed_time

    # We skip printing images/sec if --eval_during_training_* is specified,
    # because we are both processing training and evaluation images, so a
    # singular "images/sec" value is meaningless.
    if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
      log_fn('-' * 64)
      # TODO(laigd): rename 'images' to maybe 'inputs'.
      log_fn('total images/sec: %.2f' % images_per_sec)
      log_fn('-' * 64)
    else:
      log_fn('Done with training')
    num_steps_since_last_eval = local_step - last_eval_step
    mlperf.logger.log(key=mlperf.tags.INPUT_SIZE,
                      value=num_steps_since_last_eval * self.batch_size)
    python_global_step = sess.run(graph_info.global_step)
    if eval_graph_info and not skip_final_eval:
      log_fn('Running final evaluation at global_step {}'.format(
          python_global_step))
      accuracy_at_1, accuracy_at_5 = self._eval_once(
          sess, summary_writer, eval_graph_info.fetches,
          eval_graph_info.summary_op, eval_image_producer, python_global_step)
    num_epochs_ran = (python_global_step * self.batch_size /
                      self.dataset.num_examples_per_epoch('train'))
    mlperf.logger.log_train_epochs(num_epochs_ran)
    if image_producer is not None:
      image_producer.done()
    if eval_image_producer is not None:
      eval_image_producer.done()
    if is_chief:
      if self.benchmark_logger:
        self.benchmark_logger.log_metric(
            'average_examples_per_sec', images_per_sec, global_step=num_steps)

    # Save the model checkpoint.
    if self.params.train_dir is not None and is_chief:
      checkpoint_path = os.path.join(self.params.train_dir, 'model.ckpt')
      if not gfile.Exists(self.params.train_dir):
        gfile.MakeDirs(self.params.train_dir)
      supervisor.saver.save(sess, checkpoint_path, graph_info.global_step)
    if graph_info.execution_barrier:
      # Wait for other workers to reach the end, so this worker doesn't
      # go away underneath them.
      sess.run([graph_info.execution_barrier])
    stats = {
        'num_workers': self.num_workers,
        'num_steps': num_steps,
        'average_wall_time': average_wall_time,
        'images_per_sec': images_per_sec
    }
    if last_average_loss is not None:
      stats['last_average_loss'] = last_average_loss
    if accuracy_at_1 is not None:
      stats['top_1_accuracy'] = accuracy_at_1
    if accuracy_at_5 is not None:
      stats['top_5_accuracy'] = accuracy_at_5
    success = bool(self.model.reached_target() or
                   (accuracy_at_1 and self.params.stop_at_top_1_accuracy and
                    accuracy_at_1 >= self.params.stop_at_top_1_accuracy))
    mlperf.logger.log(key=mlperf.tags.RUN_STOP, value={'success': success})
    mlperf.logger.log(key=mlperf.tags.RUN_FINAL)
    return stats
  def _should_eval_during_training(self, step):
    """Return True iff should run eval during training at current step."""
    assert self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL
    if self.params.eval_during_training_every_n_steps:
      return step % self.params.eval_during_training_every_n_steps == 0
    # All other --eval_during_training_* flags are converted to step numbers
    # at which the model should run evaluation during training.
    return step in self.eval_during_training_at_specified_steps
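
  # Illustrative note (not part of the original file): with
  # --eval_during_training_every_n_steps=100 the check above reduces to
  # step % 100 == 0, so the training loop in benchmark_with_session() runs
  # _eval_once() at local steps 100, 200, 300, and so on. All other
  # --eval_during_training_* flags are pre-converted to an explicit set of
  # step numbers checked by the membership test.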
  def _preprocess_graph(self, graph, graph_info):
    """Preprocess the graph before executing.

    Depending on the params, it runs various preprocessing on the graph,
    including freezing, TensorRT conversion, etc.

    Args:
      graph: the graph to preprocess.
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
    Returns:
      The updated graph and graph_info with the ops/tensors/fetches updated
      according to the imported graph.
    """
    assert isinstance(graph_info.fetches, dict)
    assert isinstance(graph_info.global_step, tf.Variable)
    if not self.forward_only_and_freeze:
      return (graph, graph_info)

    # Get the names of the ops that need to be kept during conversion.
    flattened_op_names = list(
        set([
            v.name.split(':')[0]
            for v in nest.flatten(graph_info)
            if v is not None
        ]))
    # Get variables that we don't want to freeze.
    # Only keep unfreezable variables in forward_only_and_freeze mode.
    # TODO(laigd): consider making global_step a constant.
    variables_to_keep = {graph_info.global_step: tf.GraphKeys.GLOBAL_VARIABLES}
    variables_to_keep.update({
        local_variable: tf.GraphKeys.LOCAL_VARIABLES
        for local_variable in self._unfreezable_local_variables(graph)
    })
    variable_initializers = [
        variable.initializer.name for variable in variables_to_keep
    ]
    output_node_names = (
        flattened_op_names +
        # Add variable initializer and read ops to the output list, so
        # convert_variables_to_constants() will keep them.
        variable_initializers +
        [variable.value().op.name for variable in variables_to_keep])
    graphdef = graph.as_graph_def(add_shapes=True)

    # Freeze the graph.
    with graph.as_default():
      with tf.Session(config=create_config_proto(self.params)) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        graphdef = graph_util.convert_variables_to_constants(
            sess,
            graphdef,
            output_node_names,
            variable_names_blacklist=[
                variable.op.name for variable in variables_to_keep
            ])

    # Run TensorRT conversion.
    if self.params.trt_mode:
      # Import here instead of at top, because this will crash if TensorRT is
      # not installed
      from tensorflow.python.compiler.tensorrt import trt_convert  # pylint: disable=g-import-not-at-top
      # Prevent the TF-TRT bridge from touching all variable initializer ops
      # and their dependencies, since they can directly be fetched by
      # sess.run()s that initialize the variables.
      # pylint: disable=protected-access
      name_to_input_name, _, _ = graph_util_impl._extract_graph_summary(
          graphdef)
      initializer_subgraph_ops = graph_util_impl._bfs_for_reachable_nodes(
          variable_initializers, name_to_input_name)
      # pylint: enable=protected-access

      graphdef = trt_convert.create_inference_graph(
          graphdef,
          outputs=output_node_names + list(initializer_subgraph_ops),
          max_batch_size=self.model.get_batch_size(),
          max_workspace_size_bytes=self.params.trt_max_workspace_size_bytes,
          precision_mode=self.params.trt_mode)

    # Creates a new graph as the default and imports the converted graph back.
    updated_graph = tf.Graph()

    def _get_tensors_or_ops(inputs):
      """Gets the updated tensors or ops from 'updated_graph'."""

      def _get_fn(element):
        if element is None:
          return None
        if ':' in element.name:
          return updated_graph.get_tensor_by_name(element.name)
        return updated_graph.get_operation_by_name(element.name)

      if isinstance(inputs, (list, dict, tuple)):
        return nest.map_structure(_get_fn, inputs)
      else:
        return _get_fn(inputs)

    with updated_graph.as_default():
      importer.import_graph_def(graph_def=graphdef, name='')

      # Update the variables
      for variable in variables_to_keep:
        updated_variable = tf.Variable.from_proto(variable.to_proto())
        tf.add_to_collection(variables_to_keep[variable], updated_variable)
        if variable is graph_info.global_step:
          updated_global_step = updated_variable

    updated_graph_info = GraphInfo(
        input_producer_op=_get_tensors_or_ops(graph_info.input_producer_op),
        enqueue_ops=_get_tensors_or_ops(graph_info.enqueue_ops),
        execution_barrier=_get_tensors_or_ops(graph_info.execution_barrier),
        local_var_init_op_group=_get_tensors_or_ops(
            graph_info.local_var_init_op_group),
        fetches=_get_tensors_or_ops(graph_info.fetches),
        global_step=updated_global_step,
        summary_op=None)
    return (updated_graph, updated_graph_info)
  def _build_input_processing(self, shift_ratio=0):
    """Build the image (pre)processing portion of the model graph.

    Args:
      shift_ratio: shift_ratio for data_flow_ops.RecordInput.

    Returns:
      An InputProcessingInfo containing all the input sources to the model.
    """
    input_processing_info = InputProcessingInfo(
        input_producer_op=None,
        input_producer_stages=None,
        multi_device_iterator_input=None)

    mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
    if not self._doing_eval:
      mlperf.logger.log(key=mlperf.tags.INPUT_BATCH_SIZE,
                        value=self.batch_size)

    # If using synthetic gpu inputs, do nothing on the cpu side.
    if self.dataset.use_synthetic_gpu_inputs():
      assert not self.datasets_use_prefetch
      return input_processing_info

    if self._doing_eval:
      input_preprocessor = self.eval_input_preprocessor
      mlperf.logger.log(
          key=mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES,
          value=self.dataset.num_examples_per_epoch('validation'))
    else:
      input_preprocessor = self.input_preprocessor
      mlperf.logger.log(
          key=mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES,
          value=self.dataset.num_examples_per_epoch('train'))

    # Use prefetching mechanism provided by dataset input pipeline.
    if self.datasets_use_prefetch:
      multi_device_iterator = (
          input_preprocessor.build_multi_device_iterator(
              self.batch_size, len(self.devices), self.cpu_device, self.params,
              self.raw_devices, self.dataset, self._doing_eval))
      return input_processing_info._replace(
          multi_device_iterator_input=multi_device_iterator.get_next())

    # Not using dataset prefetching. Use a staging area to mimic the
    # prefetching behavior instead.
    with tf.device(self.cpu_device):
      if self._doing_eval:
        subset = 'validation'
      else:
        subset = 'train'
      input_list = input_preprocessor.minibatch(
          self.dataset,
          subset=subset,
          params=self.params,
          shift_ratio=shift_ratio)

      input_producer_op = []
      input_producer_stages = []
      for device_num in range(len(self.devices)):
        staging_area = data_flow_ops.StagingArea(
            [parts[0].dtype for parts in input_list],
            shapes=[parts[0].get_shape() for parts in input_list],
            shared_name='input_producer_staging_area_%d_eval_%s' %
            (device_num, self._doing_eval))
        input_producer_stages.append(staging_area)
        for group_index in xrange(self.batch_group_size):
          batch_index = group_index + device_num * self.batch_group_size
          put_op = staging_area.put(
              [parts[batch_index] for parts in input_list])
          input_producer_op.append(put_op)
      assert input_producer_op

    return input_processing_info._replace(
        input_producer_op=input_producer_op,
        input_producer_stages=input_producer_stages)
  def _maybe_initialize_fp16(self):
    """Initialize fp16 settings."""
    if self.params.use_fp16 and not self._doing_eval:
      init_loss_scale_val = float(self.params.fp16_loss_scale or
                                  self.model.get_fp16_loss_scale())
      self.loss_scale = None
      self.loss_scale_normal_steps = None
      if self.enable_auto_loss_scale or init_loss_scale_val != 1:
        self.loss_scale = tf.get_variable(
            name='loss_scale',
            initializer=init_loss_scale_val,
            dtype=tf.float32,
            trainable=False)
      if self.enable_auto_loss_scale:
        self.loss_scale_normal_steps = tf.get_variable(
            name='loss_scale_normal_steps', initializer=0, trainable=False)
  def _build_model(self):
    """Build the TensorFlow graph."""
    if self.datasets_use_prefetch:
      assert not self.params.staged_vars
      assert not self.variable_mgr.supports_staged_vars()

    # Adjust seed so different workers start reading different input files.
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      seed_adjustment = hvd.rank()
    else:
      seed_adjustment = 0
    mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
                      value=self.params.tf_random_seed + seed_adjustment)
    tf.set_random_seed(self.params.tf_random_seed + seed_adjustment)
    mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
                      value=4321 + seed_adjustment)
    np.random.seed(4321 + seed_adjustment)
    phase_train = not (self._doing_eval or self.params.forward_only)

    if self._doing_eval:
      mode_string = 'evaluation'
    else:
      mode_string = 'training'

    log_fn('Generating {} model'.format(mode_string))
    losses = []
    device_grads = []
    all_logits = []
    all_accuracy_ops = {}
    gpu_compute_stage_ops = []
    gpu_grad_stage_ops = []

    with tf.device(self.global_step_device):
      global_step = tf.train.get_or_create_global_step()
      self._maybe_initialize_fp16()

    # Build the processing and model for the worker.
    input_producer_op = None
    with tf.name_scope('input_processing'):
      input_processing_info = self._build_input_processing(shift_ratio=0)
      if input_processing_info.input_producer_op is not None:
        input_producer_op = tf.group(*input_processing_info.input_producer_op)
    update_ops = None
    staging_delta_ops = []

    for device_num in range(len(self.devices)):
      with tf.name_scope('tower_%i' % device_num) as name_scope, (
          self.variable_mgr.create_outer_variable_scope(device_num)):
        results = self.add_forward_pass_and_gradients(
            phase_train, device_num, device_num, input_processing_info,
            gpu_compute_stage_ops, gpu_grad_stage_ops)
        if self.params.backbone_model_path:
          self.model.add_backbone_saver()

        if phase_train:
          losses.append(results['loss'])
          device_grads.append(results['gradvars'])
        else:
          all_logits.append(results['logits'])
        if not phase_train or self.params.print_training_accuracy:
          for name, op in results.items():
            if name.startswith('accuracy:'):
              key = name[9:]
              if key not in all_accuracy_ops:
                all_accuracy_ops[key] = []
              all_accuracy_ops[key].append(op)

        if device_num == 0:
          # Retain the Batch Normalization updates operations only from the
          # first tower. These operations update the moving mean and moving
          # variance variables, which are updated (but not used) during
          # training, and used during evaluation. The moving mean and variance
          # approximate the true mean and variance across all images in the
          # dataset. Therefore, in replicated mode, these moving averages would
          # be almost identical for each tower, and so we only update and save
          # the moving averages for one tower. In parameter server mode, all
          # towers share a copy of the variables so we also only need to update
          # and save the moving averages once.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
          if self.datasets_use_prefetch:
            assert not self.variable_mgr.staging_delta_ops
          else:
            staging_delta_ops = list(self.variable_mgr.staging_delta_ops)

    enqueue_ops = []
    if not self.datasets_use_prefetch:
      if self.variable_mgr.supports_staged_vars():
        for staging_ops in self.variable_mgr.staging_vars_on_devices:
          gpu_compute_stage_ops.extend(
              [put_op for _, (put_op, _) in six.iteritems(staging_ops)])
      enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
                                  name='gpu_compute_stage_ops_group'))
      if gpu_grad_stage_ops:
        staging_delta_ops += gpu_grad_stage_ops
      if staging_delta_ops:
        enqueue_ops.append(tf.group(*(staging_delta_ops)))

    if (self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL and
        self.params.variable_update == 'replicated'):
      # We need to get all the update ops instead of only those for the first
      # tower. This is because during evaluation, each tower will read from its
      # own tower's moving averages instead of the first tower's moving
      # averages.
      # TODO(reedwm): Have each tower read from the first tower's moving
      # averages for a slight performance gain.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    mlperf.logger.log(key=mlperf.tags.INPUT_BN_SPAN,
                      value=self.batch_size // len(self.raw_devices))
    fetches = self._build_fetches(global_step, all_logits, losses,
                                  device_grads, enqueue_ops, update_ops,
                                  all_accuracy_ops, phase_train)
    return (input_producer_op, enqueue_ops, fetches)
  def _build_fetches(self, global_step, all_logits, losses, device_grads,
                     enqueue_ops, update_ops, all_accuracy_ops, phase_train):
    """Complete construction of model graph, populating the fetches map."""
    fetches = {}
    if enqueue_ops:
      fetches['enqueue_ops'] = enqueue_ops
    for name, ops in all_accuracy_ops.items():
      # For fetches that start with 'tensor:', keep their dimensions and skip
      # reducing them to scalars.
      if name.startswith(constants.UNREDUCED_ACCURACY_OP_PREFIX):
        key = name[len(constants.UNREDUCED_ACCURACY_OP_PREFIX):]
        fetches[key] = tf.concat(ops, 0)
      else:
        fetches[name] = tf.reduce_sum(ops) / self.batch_size
        if self.task_index == 0 and self.params.summary_verbosity >= 1:
          tf.summary.scalar(name, fetches[name])

    if not phase_train:
      if self.params.forward_only:
        fetches['all_logits'] = tf.concat(all_logits, 0)
      return fetches
    apply_gradient_devices, gradient_state = (
        self.variable_mgr.preprocess_device_grads(device_grads))

    # TODO(reedwm): Greatly simplify the learning rate code.
    if (self.params.variable_update == 'horovod' or
        self.params.variable_update == 'collective_all_reduce'):
      # Each worker independently increments global_step.
      examples_per_step = self.batch_size * self.num_workers
    else:
      # global_step is shared by all workers, and so every iteration
      # global_step is incremented by num_workers.
      examples_per_step = self.batch_size
    if self.params.compute_lr_on_cpu:
      with tf.device(self.cpu_device):
        learning_rate = get_learning_rate(
            self.params, global_step, self.dataset.num_examples_per_epoch(),
            self.model, examples_per_step)

    training_ops = []
    for d, device in enumerate(apply_gradient_devices):
      with tf.device(device):
        with tf.name_scope('average_loss'):
          average_loss = tf.reduce_mean(losses)
        with tf.name_scope('get_gradients_to_apply'):
          avg_grads = self.variable_mgr.get_gradients_to_apply(
              d, gradient_state)

        if not self.params.compute_lr_on_cpu:
          # We compute the learning rate once for each device in
          # `apply_gradient_devices`.
          learning_rate = get_learning_rate(
              self.params, global_step, self.dataset.num_examples_per_epoch(),
              self.model, examples_per_step)
        gradient_clip = self.params.gradient_clip
        if gradient_clip is not None:
          with tf.name_scope('clip_gradients'):
            clipped_grads = [(tf.clip_by_value(grad, -gradient_clip,
                                               +gradient_clip), var)
                             for grad, var in avg_grads]
        else:
          clipped_grads = avg_grads

        learning_rate = tf.identity(learning_rate,
                                    name='learning_rate_tensor')
        opt = get_optimizer(self.params, learning_rate)
        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
            enable_auto_loss_scale=self.enable_auto_loss_scale,
            loss_scale=self.loss_scale,
            loss_scale_normal_steps=self.loss_scale_normal_steps,
            inc_loss_scale_every_n=self.params.fp16_inc_loss_scale_every_n,
            is_chief=not self.job_name or self.task_index == 0)

        with tf.name_scope('append_apply_gradient_ops'):
          self.variable_mgr.append_apply_gradients_ops(
              gradient_state, opt, clipped_grads, training_ops,
              loss_scale_params)
    train_op = tf.group(*(training_ops + update_ops), name='train_ops_group')

    with tf.device(self.cpu_device):
      if self.task_index == 0 and self.params.summary_verbosity >= 1:
        tf.summary.scalar('learning_rate', learning_rate)
        tf.summary.scalar(self.params.loss_type_to_report, average_loss)
        if self.loss_scale is not None:
          tf.summary.scalar('loss_scale', self.loss_scale)
        if self.loss_scale_normal_steps:
          tf.summary.scalar('loss_scale_normal_steps',
                            self.loss_scale_normal_steps)

        if self.params.summary_verbosity >= 2:
          self.gradient_histogram_summary(avg_grads)

        if self.params.summary_verbosity >= 3:
          for grad, var in avg_grads:
            if grad is not None:
              tf.summary.histogram(var.op.name + '/gradients', grad)
          for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

    fetches['train_op'] = train_op
    fetches['average_loss'] = average_loss
    return fetches
  def gradient_histogram_summary(self, avg_grads):
    """Create histogram of log values of all non-zero gradients."""
    with tf.name_scope('log_gradients_summary'):
      all_grads = []
      for grad, _ in avg_grads:
        all_grads.append(tf.reshape(grad, [-1]))
      grads = tf.abs(tf.concat(all_grads, 0))
      # exclude grads with zero values.
      indices_for_non_zero_grads = tf.where(tf.not_equal(grads, 0))
      log_grads = tf.reshape(
          tf.log(tf.gather(grads, indices_for_non_zero_grads)), [-1])
      tf.summary.histogram('log_gradients', log_grads)
  def _build_model_single_session(self):
    """Build the TensorFlow graph for multiple replicas in a single_session.

    Returns:
      input_producer_op:
      enqueue_ops:
      fetches:

    Raises:
      ValueError: optimizer not recognized.

    Single session runs multiple model replicas as part of one large
    distributed graph, whose global execution is always step-synchronized.
    """
    # verify assumptions
    assert self.params.task_index == 0
    assert not self._doing_eval
    assert not self.params.forward_only
    assert not self.params.staged_vars

    tf.set_random_seed(self.params.tf_random_seed)
    np.random.seed(4321)
    phase_train = True

    log_fn('Generating training model')
    losses = []
    device_grads = []
    all_logits = []
    all_accuracy_ops = {}
    gpu_compute_stage_ops = []
    gpu_grad_stage_ops = []

    with tf.device(self.global_step_device):
      global_step = tf.train.get_or_create_global_step()

    update_ops = []
    global_input_producer_op = []

    is_local = not self.job_name
    if is_local:
      assert self.num_workers == 1
    for task_num in range(self.num_workers):
      # Reset the devices that self.variable_mgr knows about to those
      # belonging to the next worker (task).
      self.reset_devices_for_task(task_num, is_local)
      # Build the per-worker image processing
      with tf.name_scope('input_processing'):
        input_processing_info = self._build_input_processing(
            shift_ratio=(task_num / self.num_workers))
        if input_processing_info.input_producer_op is not None:
          global_input_producer_op.extend(
              input_processing_info.input_producer_op)
      # Build the per-worker model replica.
      for rel_device_num in range(len(self.devices)):
        abs_device_num = task_num * len(self.devices) + rel_device_num
        with self.variable_mgr.create_outer_variable_scope(
            abs_device_num), tf.name_scope(
                'task_%i_tower_%i' % (task_num, rel_device_num)) as name_scope:
          task_results = self.add_forward_pass_and_gradients(
              phase_train, rel_device_num, abs_device_num,
              input_processing_info, gpu_compute_stage_ops,
              gpu_grad_stage_ops)
          if self.params.backbone_model_path:
            self.model.add_backbone_saver()

          if phase_train:
            losses.append(task_results['loss'])
            device_grads.append(task_results['gradvars'])
          else:
            all_logits.append(task_results['logits'])
          if not phase_train or self.params.print_training_accuracy:
            for name, op in task_results.items():
              if name.startswith('accuracy:'):
                key = name[9:]
                if key not in all_accuracy_ops:
                  all_accuracy_ops[key] = []
                all_accuracy_ops[key].append(op)

          if rel_device_num == 0:
            # Retain the Batch Normalization updates operations only
            # from the first tower. These operations update the moving
            # mean and moving variance variables, which are updated
            # (but not used) during training, and used during
            # evaluation. The moving mean and variance approximate the
            # true mean and variance across all images in the
            # dataset. Therefore, in replicated mode, these moving
            # averages would be almost identical for each tower, and
            # so we only update and save the moving averages for one
            # tower. In parameter server mode, all towers share a copy
            # of the variables so we also only need to update and save
            # the moving averages once.
            update_ops.extend(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope))
            assert not self.variable_mgr.staging_delta_ops

    enqueue_ops = []
    if gpu_compute_stage_ops:
      enqueue_ops.append(
          tf.group(*gpu_compute_stage_ops, name='gpu_compute_stage_ops'))
    assert not self.variable_mgr.supports_staged_vars()
    assert not gpu_grad_stage_ops

    fetches = self._build_fetches(global_step, all_logits, losses,
                                  device_grads, enqueue_ops, update_ops,
                                  all_accuracy_ops, phase_train)
    if global_input_producer_op:
      global_input_producer_op = tf.group(*global_input_producer_op)
    else:
      global_input_producer_op = None
    return (global_input_producer_op, enqueue_ops, fetches)
  def add_forward_pass_and_gradients(self, phase_train, rel_device_num,
                                     abs_device_num, input_processing_info,
                                     gpu_compute_stage_ops,
                                     gpu_grad_stage_ops):
    """Add ops for forward-pass and gradient computations."""
    nclass = self.dataset.num_classes
    if self.datasets_use_prefetch:
      assert input_processing_info.multi_device_iterator_input, (
          'multi_device_iterator_input cannot be None if '
          'datasets_use_prefetch=True')
      input_list = (
          input_processing_info.multi_device_iterator_input[rel_device_num])
    else:
      if not self.dataset.use_synthetic_gpu_inputs():
        input_producer_stage = input_processing_info.input_producer_stages[
            rel_device_num]
        with tf.device(self.cpu_device):
          host_input_list = input_producer_stage.get()
        with tf.device(self.raw_devices[rel_device_num]):
          gpu_compute_stage = data_flow_ops.StagingArea(
              [inp.dtype for inp in host_input_list],
              shapes=[inp.get_shape() for inp in host_input_list])
          # The CPU-to-GPU copy is triggered here.
          gpu_compute_stage_op = gpu_compute_stage.put(host_input_list)
          input_list = gpu_compute_stage.get()
          gpu_compute_stage_ops.append(gpu_compute_stage_op)
      else:
        with tf.device(self.raw_devices[rel_device_num]):
          # Minor hack to avoid H2D copy when using synthetic data
          input_list = self.model.get_synthetic_inputs(
              BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME, nclass)

    # Labels reshaping happens all on gpu:0. Reshaping synthetic labels on
    # multiple devices slows down XLA computation for an unknown reason.
    # TODO(b/116875203): Find/address root cause of XLA slow down.
    labels_device_placement_hack = (
        self.dataset.use_synthetic_gpu_inputs() and self.params.xla_compile)

    def device_aware_reshape(tensor, shape):
      device = self.devices[rel_device_num]
      # Labels are int32, place reshapes on gpu:0 (no device placement) when
      # the hack is enabled.
      if labels_device_placement_hack and tensor.dtype == tf.int32:
        device = ''
      with tf.device(device):
        return tf.reshape(tensor, shape=shape)

    subset = 'validation' if self._doing_eval else 'train'
    input_shapes = self.model.get_input_shapes(subset)
    input_list = [
        device_aware_reshape(input_list[i], shape=input_shapes[i])
        for i in range(len(input_list))
    ]

    def forward_pass_and_gradients():
      """Builds forward pass and gradient computation network.

      When phase_train=True and print_training_accuracy=False:
        return [loss] + grads

      When phase_train=True and print_training_accuracy=True:
        return [logits, loss] + grads

      When phase_train=False,
        return [logits]

      Its output can always be unpacked by
      ```
        outputs = forward_pass_and_gradients()
        logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
      ```

      Returns:
        outputs: A list of tensors depending on different modes.
      """
      build_network_result = self.model.build_network(
          input_list, phase_train, nclass)
      logits = build_network_result.logits
      if not phase_train:
        return [logits]

      base_loss = self.model.loss_function(input_list, build_network_result)
      params = self.variable_mgr.trainable_variables_on_device(
          rel_device_num, abs_device_num)
      l2_loss = None
      total_loss = base_loss
      with tf.name_scope('l2_loss'):
        fp32_params = params
        if self.model.data_type == tf.float16 and self.params.fp16_vars:
          # fp16 reductions are very slow on GPUs, so cast to fp32 before
          # calling tf.nn.l2_loss and tf.add_n.
          # TODO(b/36217816): Once the bug is fixed, investigate if we should
          # do this reduction in fp16.
          fp32_params = (tf.cast(p, tf.float32) for p in params)

        filtered_params = self.model.filter_l2_loss_vars(fp32_params)
        if rel_device_num == len(self.devices) - 1:
          # We compute the L2 loss for only one device instead of all of them,
          # because the L2 loss for each device is the same. To adjust for
          # this, we multiply the L2 loss by the number of devices. We choose
          # the last device because for some reason, on a Volta DGX1, the first
          # four GPUs take slightly longer to complete a step than the last
          # four.
          # TODO(reedwm): Shard the L2 loss computations across GPUs.
          if self.params.single_l2_loss_op:
            # TODO(reedwm): If faster, create a fused op that does the L2 loss
            # on multiple tensors, and use that instead of concatenating
            # tensors.
            reshaped_params = [tf.reshape(p, (-1,)) for p in filtered_params]
            l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0))
          else:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in filtered_params])
      weight_decay = self.params.weight_decay
      mlperf.logger.log(key=mlperf.tags.OPT_WEIGHT_DECAY, value=weight_decay)
      if (weight_decay is not None and weight_decay != 0. and
          l2_loss is not None):
        mlperf.logger.log(key=mlperf.tags.MODEL_L2_REGULARIZATION,
                          value=weight_decay)
        total_loss += len(self.devices) * weight_decay * l2_loss

      aggmeth = tf.AggregationMethod.DEFAULT
      scaled_loss = (total_loss if self.loss_scale is None
                     else total_loss * self.loss_scale)
      grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth)
      if self.params.sparse_to_dense_grads:
        # Passing a sparse gradient to convert_to_tensor turns it into a dense
        # gradient. A sparse gradient is an instance of tf.IndexedSlices.
        # convert_to_tensor does not modify dense tensors.
        grads = [tf.convert_to_tensor(g) for g in grads]
      if self.loss_scale is not None:
        # TODO(reedwm): If automatic loss scaling is not used, we could avoid
        # these multiplications by directly modifying the learning rate
        # instead. If this is done, care must be taken to ensure that this
        # scaling method is correct, as some optimizers square gradients and do
        # other operations which might not be compatible with modifying both
        # the gradients and the learning rate.
        grads = [
            grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads
        ]

      if self.params.variable_update == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        if self.params.horovod_device:
          horovod_device = '/%s:0' % self.params.horovod_device
        else:
          horovod_device = ''
        # All-reduce gradients using Horovod.
        grads = [
            hvd.allreduce(grad, average=False, device_dense=horovod_device)
            for grad in grads
        ]

      if self.params.staged_vars:
        grad_dtypes = [grad.dtype for grad in grads]
        grad_shapes = [grad.shape for grad in grads]
        grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes)
        grad_stage_op = grad_stage.put(grads)
        # In general, this decouples the computation of the gradients and
        # the updates of the weights.
        # During the pipeline warm up, this runs enough training to produce
        # the first set of gradients.
        gpu_grad_stage_ops.append(grad_stage_op)
        grads = grad_stage.get()

      if self.params.loss_type_to_report == 'total_loss':
        loss = total_loss
      else:
        loss = base_loss

      if self.params.print_training_accuracy:
        return [logits, loss] + grads
      else:
        return [loss] + grads

    def unpack_forward_pass_and_gradients_output(forward_pass_and_grad_outputs):
      """Unpacks outputs from forward_pass_and_gradients.

      Args:
        forward_pass_and_grad_outputs: Output from forward_pass_and_gradients.

      Returns:
        logits: Unscaled probability distribution from forward pass.
          If unavailable, None is returned.
        loss: Loss function result from logits.
          If unavailable, None is returned.
        grads: Gradients for all trainable variables.
          If unavailable, None is returned.
      """
      logits = None
      # logits is only fetched in non-train mode or when
      # print_training_accuracy is set.
      if not phase_train or self.params.print_training_accuracy:
        logits = forward_pass_and_grad_outputs.pop(0)

      loss = (forward_pass_and_grad_outputs[0]
              if forward_pass_and_grad_outputs else None)
      grads = (forward_pass_and_grad_outputs[1:]
               if forward_pass_and_grad_outputs else None)

      return logits, loss, grads

    def make_results(logits, loss, grads):
      """Generate results based on logits, loss and grads."""
      results = {}  # The return value

      if logits is not None:
        results['logits'] = logits
        accuracy_ops = self.model.accuracy_function(input_list, logits)
        for name, op in accuracy_ops.items():
          results['accuracy:' + name] = op

      if loss is not None:
        results['loss'] = loss

      if grads is not None:
        param_refs = self.variable_mgr.trainable_variables_on_device(
            rel_device_num, abs_device_num, writable=True)
        results['gradvars'] = list(zip(grads, param_refs))

      return results

    with tf.device(self.devices[rel_device_num]):
      outputs = maybe_compile(forward_pass_and_gradients, self.params)
      logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
      return make_results(logits, loss, grads)
  def get_input_preprocessor(self):
    """Returns the image preprocessor to be used, based on the model.

    Returns:
      The image preprocessor, or None if synthetic data should be used.
    """
    shift_ratio = 0
    if self.job_name:
      # shift_ratio prevents multiple workers from processing the same batch
      # during a step
      shift_ratio = self.task_index / self.num_workers

    processor_class = self.dataset.get_input_preprocessor(
        self.params.input_preprocessor)
    assert processor_class
    subset = 'validation' if self._doing_eval else 'train'
    return processor_class(
        self.batch_size * self.batch_group_size,
        self.model.get_input_shapes(subset),
        len(self.devices) * self.batch_group_size,
        dtype=self.model.data_type,
        train=(not self._doing_eval),
        # TODO(laigd): refactor away image model specific parameters.
        distortions=self.params.distortions,
        resize_method=self.resize_method,
        shift_ratio=shift_ratio,
        summary_verbosity=self.params.summary_verbosity,
        distort_color_in_yiq=self.params.distort_color_in_yiq,
        fuse_decode_and_crop=self.params.fuse_decode_and_crop,
        match_mlperf=self.params.ml_perf)
  def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list):
    """Adds ops to enqueue on all worker queues.

    Args:
      name_prefix: prefix used for the shared_name of ops.
      enqueue_after_list: control dependency from ops.

    Returns:
      An op that should be used as control dependency before starting next
      step.
    """
    self.sync_queue_counter += 1
    with tf.device(self.sync_queue_devices[(
        self.sync_queue_counter % len(self.sync_queue_devices))]):
      sync_queues = [
          tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]],
                       shared_name='%s%s' % (name_prefix, i))
          for i in range(self.num_workers)]
      queue_ops = []
      # For each other worker, add an entry in a queue, signaling that it can
      # finish this step.
      token = tf.constant(False)
      with tf.control_dependencies(enqueue_after_list):
        for i, q in enumerate(sync_queues):
          if i == self.task_index:
            queue_ops.append(tf.no_op())
          else:
            queue_ops.append(q.enqueue(token))

      # Drain tokens off queue for this worker, one for each other worker.
      queue_ops.append(
          sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1))

      return tf.group(*queue_ops)
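
  # Illustrative note (not part of the original file): a minimal worked
  # example of the barrier protocol above, assuming num_workers=3 and
  # task_index=1. This worker enqueues one token into the queues of workers 0
  # and 2, then dequeues num_workers - 1 = 2 tokens from its own queue, so the
  # returned group op completes only after every worker has reached the
  # barrier.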
def _is_mkl_flag_absent(mkl_flag):
  return not (absl_flags.FLAGS.is_parsed() and mkl_flag in absl_flags.FLAGS and
              absl_flags.FLAGS[mkl_flag].present)


def _print_os_env_ignored_warning(mkl_flag, flag_default_val, os_env_var):
  tf.logging.warn(
      ('OS ENV variable %s=%s is ignored and script default: '
       '%s is used. Use --%s to override.') %
      (os_env_var, os.environ[os_env_var], flag_default_val, mkl_flag))
def set_default_param_values_and_env_vars(params):
  """Sets up the default param values and environment variables."""
  if params.batchnorm_persistent:
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
  else:
    os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
  if params.winograd_nonfused:
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
  else:
    os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None)
  if params.autotune_threshold:
    os.environ['TF_AUTOTUNE_THRESHOLD'] = str(params.autotune_threshold)
  os.environ['TF_SYNC_ON_FINISH'] = str(int(params.sync_on_finish))
  argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)

  # Sets environment variables for MKL
  # If OS ENV vars are overridden by script defaults, a warning msg is printed.
  if params.mkl:
    mkl_flags = ['kmp_blocktime', 'kmp_settings', 'kmp_affinity',
                 'num_intra_threads']
    for mkl_flag in mkl_flags:
      os_env_var = mkl_flag.upper()
      if mkl_flag == 'num_intra_threads':
        os_env_var = 'OMP_NUM_THREADS'
      flag_val = str(getattr(params, mkl_flag))
      if _is_mkl_flag_absent(mkl_flag) and os_env_var in os.environ:
        _print_os_env_ignored_warning(mkl_flag, flag_val, os_env_var)
      os.environ[os_env_var] = flag_val
      if mkl_flag == 'num_intra_threads' and not params.num_intra_threads:
        os.environ.pop(os_env_var, None)

  # Sets GPU thread settings
  if params.device.lower() == 'gpu':
    params = params._replace(gpu_thread_mode=params.gpu_thread_mode.lower())
    if params.gpu_thread_mode not in ['global', 'gpu_shared', 'gpu_private']:
      raise ValueError('Invalid gpu_thread_mode: %s' % params.gpu_thread_mode)
    os.environ['TF_GPU_THREAD_MODE'] = params.gpu_thread_mode

    if params.per_gpu_thread_count and params.gpu_thread_mode == 'global':
      raise ValueError(
          'Invalid per_gpu_thread_count with gpu_thread_mode=global: %s' %
          params.per_gpu_thread_count)
    # Default to two threads. One for the device compute and the other for
    # memory copies.
    per_gpu_thread_count = params.per_gpu_thread_count or 2
    total_gpu_thread_count = per_gpu_thread_count * params.num_gpus

    if params.gpu_thread_mode == 'gpu_private':
      os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
    elif params.gpu_thread_mode == 'gpu_shared':
      os.environ['TF_GPU_THREAD_COUNT'] = str(total_gpu_thread_count)

    cpu_count = multiprocessing.cpu_count()
    if not params.num_inter_threads and params.gpu_thread_mode in [
        'gpu_private', 'gpu_shared'
    ]:
      main_thread_count = max(cpu_count - total_gpu_thread_count, 1)
      params = params._replace(num_inter_threads=main_thread_count)

    if (params.datasets_use_prefetch and
        params.datasets_num_private_threads is None):
      # From the total cpu thread count, subtract the total_gpu_thread_count,
      # and then 2 threads per GPU device for event monitoring and sending /
      # receiving tensors
      num_monitoring_threads = 2 * params.num_gpus
      num_private_threads = max(
          cpu_count - total_gpu_thread_count - num_monitoring_threads, 1)
      params = params._replace(
          datasets_num_private_threads=num_private_threads)
  return params
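

# Illustrative note (not part of the original file): a small worked example of
# the GPU thread arithmetic above, assuming num_gpus=8 and the default
# per_gpu_thread_count of 2:
#
#   total_gpu_thread_count = 2 * 8 = 16
#   gpu_thread_mode='gpu_private' -> TF_GPU_THREAD_COUNT='2'
#   gpu_thread_mode='gpu_shared'  -> TF_GPU_THREAD_COUNT='16'
#   num_inter_threads, if unset, defaults to max(cpu_count - 16, 1)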
def setup(params):
  """Sets up the environment that BenchmarkCNN should run in.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.

  Returns:
    A potentially modified params.
  Raises:
    ValueError: invalid params combinations.
  """
  # Set up environment variables before doing any other global initialization
  # to make sure it uses the appropriate environment variables.
  params = set_default_param_values_and_env_vars(params)

  # horovod needs to be initialized before create_config_proto() call since
  # it will be used in config generation if enabled.
  if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    hvd.init()

  platforms_util.initialize(params, create_config_proto(params))

  if not params.job_name:
    # Create a dummy session to initialize TF global variables using the input
    # params. Otherwise, ListDevices function may create global devices using
    # the default config instead of using the user provided config.
    #
    # TODO(hinsu): Find a way to achieve the same for distributed benchmark. It
    # is not legal to create distributed session after local session. It is
    # also not possible to create distributed session here as that results in
    # multiple creation of ClusterManager and Server.
    with tf.Session(config=create_config_proto(params)) as sess:
      del sess

  return params
def maybe_compile(computation, params):
  if params and params.xla_compile:
    return tf.xla.experimental.compile(computation)
  else:
    return computation()
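

# Illustrative note (not part of the original file): a minimal usage sketch of
# maybe_compile(), assuming `params` came from benchmark_cnn.make_params().
# With xla_compile=True the callable is routed through
# tf.xla.experimental.compile(); otherwise it is invoked as a plain Python
# call:
#
#   outputs = maybe_compile(forward_pass_and_gradients, params)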
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test.py
deleted 100644 → 0
View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests running benchmark_cnn in distributed mode.

This is done by spawning one process per task. Each process runs
benchmark_cnn_distributed_test_runner.py.

The output for each process is written to disk and can be viewed to debug tests.
See get_test_output_dir() in platforms/default/util.py for more info.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import os
import subprocess
import time
import unittest

from absl import flags as absl_flags
import portpicker
import six
import tensorflow.compat.v1 as tf

import flags
import test_util
from platforms import util as platforms_util

FLAGS = absl_flags.FLAGS


def _convert_params_to_flags_list(params):
  """Converts Params to a list of flags. Skips default-valued parameters.

  E.g., converts
    benchmark_cnn.make_params(batch_size=32, model='resnet50')
  to
    ['--batch_size=32', '--model=resnet50']

  Args:
    params: Params for BenchmarkCNN.

  Returns:
    A list of flags.
  """
  return [
      '--%s=%s' % (k, str(v)) for k, v in six.iteritems(params._asdict())
      if v != flags.param_specs[k].default_value
  ]


# When outputting a process's output in the log, maximum number of characters
# to output. The log system does not allow us to output more than this in a
# single log message, but this limit is also useful to avoid the logs from
# becoming too large (the full process output is written to disk).
MAX_OUTPUT_CHARS = 15000


# A process. name is a string identifying the process in logs. stdout and
# stderr are file objects of the process's stdout and stderr, respectively.
_ProcessInfo = namedtuple('_ProcessInfo', ['name', 'popen', 'stdout', 'stderr'])


def _create_task_process(job_name, task_index, args, env, output_dir):
  """Creates a process for a single task for benchmark_cnn.

  Args:
    job_name: 'worker' or 'ps' or ''. Empty string used for non-distributed
      mode.
    task_index: The index of the task within the cluster.
    args: A list of arguments to pass to the task. This function additionally
      sets --task_index and --job_name
    env: The environment to use for the task.
    output_dir: Where to place the output files, storing the task's stdout and
      stderr.

  Returns:
    A _ProcessInfo namedtuple of the running process. The stdout and stderr
    fields of this tuple must be closed by the caller once the process ends.
  """
  args = args[:]
  args += ['--task_index=%s' % task_index, '--job_name=%s' % job_name]
  name_prefix = job_name or 'local'
  process_name = '%s_%s' % (name_prefix, task_index)
  tf.logging.info('Spawning %s process: %s' % (process_name, ' '.join(args)))
  stdout_filename = os.path.join(output_dir, '%s_stdout.txt' % process_name)
  stderr_filename = os.path.join(output_dir, '%s_stderr.txt' % process_name)
  stdout_file = open(stdout_filename, 'w+')
  stderr_file = open(stderr_filename, 'w+')
  popen = subprocess.Popen(
      args, stdout=stdout_file, stderr=stderr_file, env=env)
  return _ProcessInfo(process_name, popen, stdout_file, stderr_file)


def _wait_for_processes(wait_processes, kill_processes):
  """Waits until all `wait_processes` finish, then kills `kill_processes`.

  Fails an assert if a process in `wait_processes` finishes unsuccessfully.
  The processes in `kill_processes` are assumed to never finish so they are
  killed.

  Args:
    wait_processes: A list of _ProcessInfo tuples. This function will wait
      for each to finish.
    kill_processes: A list of _ProcessInfo tuples. Each will be killed once
      every process in `wait_processes` is finished.

  Returns:
    A list of strings, each which is a string of the stdout of a wait process.
  """
  wait_process_stdouts = [None] * len(wait_processes)
  finished_wait_processes = set()
  while len(finished_wait_processes) < len(wait_processes):
    for i, wait_process in enumerate(wait_processes):
      if i in finished_wait_processes:
        continue
      ret_code = wait_process.popen.poll()
      if ret_code is None:
        continue
      tf.logging.info('{} finished'.format(wait_process.name))
      wait_process.stdout.seek(0)
      wait_process_stdouts[i] = wait_process.stdout.read()
      tf.logging.info('stdout for {} (last {} chars): {}\n'.format(
          wait_process.name, MAX_OUTPUT_CHARS,
          wait_process_stdouts[i][-MAX_OUTPUT_CHARS:]))
      wait_process.stderr.seek(0)
      tf.logging.info('stderr for {} (last {} chars): {}\n'.format(
          wait_process.name, MAX_OUTPUT_CHARS,
          wait_process.stderr.read()[-MAX_OUTPUT_CHARS:]))
      assert ret_code == 0, 'Process failed with return code %d' % ret_code
      finished_wait_processes.add(i)
    for kill_process in kill_processes:
      ret_code = kill_process.popen.poll()
      # kill processes should not end until we kill them.
      assert ret_code is None, 'Process returned early with code %d' % ret_code
    time.sleep(0.25)
  tf.logging.info('All wait processes finished')
  for i, kill_process in enumerate(kill_processes):
    # Kill each kill process.
    kill_process.popen.kill()
    kill_process.popen.wait()
    kill_process.stdout.seek(0)
    tf.logging.info('stdout for {} (last {} chars): {}\n'.format(
        kill_process.name, MAX_OUTPUT_CHARS,
        kill_process.stdout.read()[-MAX_OUTPUT_CHARS:]))
    kill_process.stderr.seek(0)
    tf.logging.info('stderr for {} (last {} chars): {}\n'.format(
        kill_process.name, MAX_OUTPUT_CHARS,
        kill_process.stderr.read()[-MAX_OUTPUT_CHARS:]))
  return wait_process_stdouts


def _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
                               num_controllers, params):
  """Run training or evaluation in spawned processes.

  Runs locally if num_workers == 1, num_ps == 0, and num_controllers == 0,
  otherwise runs in distributed mode. In either case, one process is spawned
  per worker and ps. Waits for training/evaluation to finish before returning.

  Args:
    output_dir_path: Relative path where stdout and stderr files will be
      placed.
    num_workers: Number of workers to spawn.
    num_ps: Number of ps processes to spawn.
    num_controllers: Number of controller processes to spawn (must be 0 or 1).
    params: Params for BenchmarkCNN in each subprocess.

  Returns:
    A list output_list of outputs from all processes that output the
    images/sec and accuracy. This process is the controller host in
    distributed_all_reduce, and the workers otherwise. output_list[i] is a
    list of lines from the ith worker's stdout.
  """
  run_distributed = num_workers != 1 or num_ps != 0 or num_controllers != 0
  if params.variable_update == 'distributed_all_reduce':
    assert num_controllers == 1 or not run_distributed
    assert num_ps == 0
  else:
    assert num_controllers == 0
  output_base_dir = platforms_util.get_test_output_dir()
  output_dir = os.path.join(output_base_dir, output_dir_path)
  os.makedirs(output_dir)
  tf.logging.info('Outputs of processes will be outputted to: %s' % output_dir)

  args = platforms_util.get_command_to_run_python_module(
      'benchmark_cnn_distributed_test_runner')
  args += _convert_params_to_flags_list(params)
  if run_distributed:
    worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
    ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
    controller_ports = [
        portpicker.pick_unused_port() for _ in range(num_controllers)
    ]
    # The numerator is 0.7 instead of 1 to leave some memory for the Cuda
    # runtime, etc.
    gpu_memory_frac = 0.7 / num_workers
    args += [
        '--gpu_memory_frac_for_testing=%f' % gpu_memory_frac,
        '--worker_hosts=' + ','.join('localhost:%d' % p for p in worker_ports)
    ]
    if num_ps > 0:
      ps_hosts_str = ','.join('localhost:%d' % p for p in ps_ports)
      args.append('--ps_hosts=' + ps_hosts_str)
    else:
      controller_host_str = ','.join('localhost:%d' % p
                                     for p in controller_ports)
      args.append('--controller_host=' + controller_host_str)
  env = os.environ.copy()
  # Allow stdout to be viewed before the process ends.
  env['PYTHONUNBUFFERED'] = '1'

  worker_processes = []
  ps_processes = []
  controller_processes = []
  try:
    for i in range(num_workers):
      job_name = 'worker' if run_distributed else ''
      process = _create_task_process(job_name, i, args, env, output_dir)
      worker_processes.append(process)
    # Don't let ps or controller processes use the gpu.
    env['CUDA_VISIBLE_DEVICES'] = ''
    for i in range(num_ps):
      process = _create_task_process('ps', i, args, env, output_dir)
      ps_processes.append(process)
    for i in range(num_controllers):
      process = _create_task_process('controller', i, args, env, output_dir)
      controller_processes.append(process)
    # If all distributed all reduce mode is being used, the controller process
    # finishes and the worker processes block forever. Otherwise, the worker
    # processes finish and the ps processes block forever. We set
    # wait_processes and kill_processes accordingly.
    if controller_processes:
      wait_processes = controller_processes
      kill_processes = worker_processes
    else:
      wait_processes = worker_processes
      kill_processes = ps_processes
    outputs = _wait_for_processes(wait_processes, kill_processes)
  finally:
    for process in worker_processes + ps_processes + controller_processes:
      try:
        process.popen.kill()
      except OSError:
        pass  # It's OK (and expected) if the process already exited.
      process.stdout.close()
      process.stderr.close()
  return [output.splitlines() for output in outputs]


# When this test class is run, a method will fail about 0.3% of the time with a
# gRPC error. It is not clear why this occurs.
# TODO(reedwm): Fix this test class.
class TfCnnBenchmarksDistributedTest(tf.test.TestCase):
  """Tests running benchmark_cnn in distributed mode."""

  # We cannot check for a GPU via tf.test.is_gpu_available() before the tests in
  # this class because it allocates all the GPU memory which would cause the
  # spawned processes to run out of GPU memory.

  def _test_distributed(self,
                        test_name,
                        num_workers,
                        num_ps,
                        params,
                        num_controllers=0,
                        check_output_values=False,
                        skip=None):
    # TODO(reedwm): check_output_values should default to True and be enabled
    # on every test. See the TODO in benchmark_cnn_test.py.
    def run_fn(run_type, inner_params):
      output_dir_path = os.path.join(test_name, run_type)
      if run_type == 'Evaluation':
        # Distributed evaluation is not supported, so we use a single process.
        # We still must spawn another process, because if we evaluate in the
        # current process, it would allocate the GPU memory causing future test
        # methods to fail.
        if inner_params.variable_update == 'distributed_replicated':
          inner_params = inner_params._replace(variable_update='replicated')
        return _spawn_benchmark_processes(
            output_dir_path, num_workers=1, num_ps=0, num_controllers=0,
            params=inner_params)
      else:
        return _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
                                          num_controllers, inner_params)

    return test_util.train_and_eval(self, run_fn, params,
                                    check_output_values=check_output_values,
                                    skip=skip)

  def testParameterServer(self):
    test_name = 'testParameterServer'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 2, 2, params)

  def testParameterServerStaged(self):
    test_name = 'testParameterServerStaged'
    params = test_util.get_params(test_name)._replace(staged_vars=True)
    self._test_distributed(test_name, 2, 2, params)

  def testReplicated(self):
    test_name = 'testReplicated'
    params = test_util.get_params(test_name)._replace(
        variable_update='distributed_replicated')
    self._test_distributed(test_name, 2, 2, params)

  def testAllReducePsgpu(self):
    test_name = 'testAllReducePsgpu'
    flags_dict = test_util.get_params(test_name)._replace(
        variable_update='distributed_all_reduce',
        all_reduce_spec='psgpu#4')
    self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1)

  def testAllReducePscpuXring(self):
    test_name = 'testAllReducePscpuXring'
    flags_dict = test_util.get_params(test_name)._replace(
        variable_update='distributed_all_reduce',
        all_reduce_spec='pscpu:2k:xring')
    self._test_distributed(test_name, 2, 0, flags_dict, num_controllers=1)

  def testForwardOnly(self):
    test_name = 'testForwardOnly'
    params = test_util.get_params(test_name)._replace(forward_only=True)
    # Evaluation is not supported with --forward_only, so we set skip='eval'.
    self._test_distributed(test_name, 2, 2, params, skip='eval')

  def testSingleWorkerAndPs(self):
    test_name = 'testSingleWorkerAndPs'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 1, 1, params)

  def testThreeWorkersAndPses(self):
    test_name = 'testThreeWorkersAndPses'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 3, 3, params)

  def testOneWorkerThreePses(self):
    test_name = 'testOneWorkerThreePses'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 1, 3, params)

  def testThreeWorkersOnePs(self):
    test_name = 'testThreeWorkersOnePs'
    params = test_util.get_params(test_name)
    self._test_distributed(test_name, 3, 1, params)

  def testNoPrintTrainingAccuracy(self):
    test_name = 'testNoPrintTrainingAccuracy'
    params = test_util.get_params(test_name)._replace(
        print_training_accuracy=False)
    self._test_distributed(test_name, 2, 2, params)

  def testRmspropParameterServer(self):
    test_name = 'testRmspropParameterServer'
    params = test_util.get_params(test_name)._replace(optimizer='rmsprop')
    self._test_distributed(test_name, 2, 2, params)

  def testMomentumReplicated(self):
    test_name = 'testMomentumReplicated'
    params = test_util.get_params(test_name)._replace(
        optimizer='momentum', variable_update='distributed_replicated')
    self._test_distributed(test_name, 2, 2, params)

  def testNoCrossReplicaSyncParameterServerStaged(self):
    test_name = 'testNoCrossReplicaSyncParameterServerStaged'
    params = test_util.get_params(test_name)._replace(
        staged_vars=True, cross_replica_sync=False)
    self._test_distributed(test_name, 2, 2, params)

  def testSingleGpu(self):
    test_name = 'testSingleGpu'
    params = test_util.get_params(test_name)._replace(num_gpus=1)
    self._test_distributed(test_name, 2, 2, params)

  def testBatchGroupSize(self):
    test_name = 'testBatchGroupSize'
    params = test_util.get_params(test_name)._replace(
        batch_group_size=4, num_batches=100, num_warmup_batches=5)
    self._test_distributed(test_name, 2, 2, params)

  def testFp16WithFp32Vars(self):
    test_name = 'testFp16WithFp32Vars'
    params = test_util.get_params(test_name)._replace(
        use_fp16=True, fp16_vars=False)
    self._test_distributed(test_name, 2, 2, params)

  def testFp16WithFp16Vars(self):
    test_name = 'testFp16WithFp16Vars'
    params = test_util.get_params(test_name)._replace(
        use_fp16=True, fp16_vars=True, fp16_loss_scale=1.)
    self._test_distributed(test_name, 2, 2, params)

  def testFp16Replicated(self):
    test_name = 'testFp16Replicated'
    params = test_util.get_params(test_name)._replace(
        use_fp16=True, variable_update='distributed_replicated')
    self._test_distributed(test_name, 2, 2, params)

  @unittest.skip('b/147310862: Fails for unknown reason')
  def testReplicatedRealData(self):
    test_name = 'testReplicatedRealData'
    imagenet_dir = os.path.join(platforms_util.get_test_data_dir(),
                                'fake_tf_record_data')
    params = test_util.get_params(test_name)._replace(
        variable_update='distributed_replicated',
        data_dir=imagenet_dir,
        data_name='imagenet')
    self._test_distributed(test_name, 2, 2, params)


class DistributedVariableUpdateTest(tf.test.TestCase):
  """Tests that variables are updated correctly in distributed mode."""

  def _test_variable_update(self,
                            test_name,
                            num_workers,
                            num_ps,
                            params,
                            num_controllers=0):
    """Tests variables are updated correctly when the given params are used."""
    output_dir_path = os.path.join(test_name, 'variable_update')
    logs = _spawn_benchmark_processes(output_dir_path, num_workers, num_ps,
                                      num_controllers, params)
    actual_losses = []
    for worker_logs in logs:
      outputs = test_util.get_training_outputs_from_logs(
          worker_logs, params.print_training_accuracy)
      actual_losses.append([x.loss for x in outputs])

    inputs = test_util.get_fake_var_update_inputs()
    expected_losses = test_util.TestCNNModel().manually_compute_losses(
        inputs, num_workers, params)
    if params.variable_update == 'distributed_all_reduce':
      # In distributed all reduce, each step, the controller outputs the average
      # of the loss from each worker. So we modify expected losses accordingly.
      # E.g, we change [[1, 2], [4, 5]] to [[2.5, 3.5]]
      expected_losses = [[
          sum(losses) / num_workers for losses in zip(*expected_losses)
      ]]
    rtol = 3e-2 if params.use_fp16 else 1e-5
    for worker_actual_losses, worker_expected_losses in zip(
        actual_losses, expected_losses):
      self.assertAllClose(worker_actual_losses[:len(worker_expected_losses)],
                          worker_expected_losses, rtol=rtol, atol=0.)

  def _test_variable_updates(self, test_name, params):
    """Tests variables are updated correctly with various variable updates."""

    # Unfortunately, distributed parameter server is non-deterministic with
    # multiple workers, because one worker may write to a variable before
    # another worker reads it. This probably does not harm training, but it
    # does mean we cannot easily test that case. So, we use one worker.
    self._test_variable_update(
        test_name + '_ps', num_workers=1, num_ps=2, num_controllers=0,
        params=params._replace(variable_update='parameter_server'))

    self._test_variable_update(
        test_name + '_rep', num_workers=2, num_ps=1, num_controllers=0,
        params=params._replace(variable_update='distributed_replicated'))

    self._test_variable_update(
        test_name + '_allreduce', num_workers=2, num_ps=0, num_controllers=1,
        params=params._replace(variable_update='distributed_all_reduce',
                               all_reduce_spec='psgpu#%d' % params.num_gpus))

  def testVarUpdateDefault(self):
    params = test_util.get_var_update_params()
    self._test_variable_updates('testVarUpdateDefault', params)

  def testVarUpdateCpuAsLocalParamDevice(self):
    params = test_util.get_var_update_params()._replace(
        local_parameter_device='cpu')
    self._test_variable_updates('testVarUpdateCpuAsLocalParamDevice', params)

  def testVarUpdateFp16(self):
    params = test_util.get_var_update_params()._replace(use_fp16=True)
    self._test_variable_updates('testVarUpdateFp16', params)

  def testVarUpdateResourceVars(self):
    params = test_util.get_var_update_params()._replace(use_resource_vars=True)
    self._test_variable_updates('testVarUpdateResourceVars', params)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
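The helpers above are only exercised through the test methods, but a single local (non-distributed) run can be reproduced directly with _spawn_benchmark_processes, since one worker with no ps or controller tasks falls back to local mode per its docstring. A hedged sketch, assuming the scripts directory is importable and platforms_util.get_test_output_dir() points somewhere writable; the output subdirectory name is a placeholder:

# Sketch of a manual local run using the helpers defined in this file.
import test_util

outputs = _spawn_benchmark_processes(
    output_dir_path='manual_local_run',  # placeholder subdirectory name
    num_workers=1, num_ps=0, num_controllers=0,
    params=test_util.get_params('manual_local_run'))
print(outputs[0][-5:])  # last few stdout lines from the single worker process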
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_distributed_test_runner.py deleted 100644 → 0
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Used to run benchmark_cnn for distributed tests.

In distributed tests, we spawn processes to run tf_cnn_benchmark tasks. We could
directly spawn tf_cnn_benchmark processes, but we want some added functionality,
such as being able to inject custom images during training. So instead, this
file is spawned as a Python process, which supports the added functionality.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import flags as absl_flags
import numpy as np
import tensorflow.compat.v1 as tf

import benchmark_cnn
import flags
import preprocessing
import test_util


absl_flags.DEFINE_string('fake_input', 'none',
                         """What fake input to inject into benchmark_cnn. This
                         is ignored if --model=test_model.
                         Options are:
                         none: Do not inject any fake input.
                         zeros_and_ones: Half the images will be all 0s with
                         a label of 0. Half the images will be all 1s with a
                         label of 1.""")

flags.define_flags()
FLAGS = flags.FLAGS


def get_test_image_preprocessor(batch_size, params):
  """Returns the preprocessing.TestImagePreprocessor that should be injected.

  Returns None if no preprocessor should be injected.

  Args:
    batch_size: The batch size across all GPUs.
    params: BenchmarkCNN's parameters.

  Returns:
    Returns the preprocessing.TestImagePreprocessor that should be injected.

  Raises:
    ValueError: Flag --fake_input is an invalid value.
  """
  if FLAGS.fake_input == 'none':
    return None
  elif FLAGS.fake_input == 'zeros_and_ones':
    half_batch_size = batch_size // 2
    images = np.zeros((batch_size, 227, 227, 3), dtype=np.float32)
    images[half_batch_size:, :, :, :] = 1
    labels = np.array([0] * half_batch_size + [1] * half_batch_size,
                      dtype=np.int32)
    preprocessor = preprocessing.TestImagePreprocessor(
        batch_size, [227, 227, 3], params.num_gpus,
        benchmark_cnn.get_data_type(params))
    preprocessor.set_fake_data(images, labels)
    preprocessor.expected_subset = 'validation' if params.eval else 'train'
    return preprocessor
  else:
    raise ValueError('Invalid --fake_input: %s' % FLAGS.fake_input)


def run_with_real_model(params):
  """Runs tf_cnn_benchmarks with a real model."""
  bench = benchmark_cnn.BenchmarkCNN(params)
  bench.print_info()
  preprocessor = get_test_image_preprocessor(bench.batch_size, params)
  if preprocessor is not None:
    # The test image preprocessor requires queue runners. Since this file is
    # used for testing, it is OK to access protected members.
    # pylint: disable=protected-access
    bench.dataset._queue_runner_required = True
    # pylint: enable=protected-access
    bench.input_preprocessor = preprocessor
  bench.run()


def run_with_test_model(params):
  """Runs tf_cnn_benchmarks with a test model."""
  model = test_util.TestCNNModel()
  inputs = test_util.get_fake_var_update_inputs()
  with test_util.monkey_patch(benchmark_cnn,
                              LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
    bench = benchmark_cnn.BenchmarkCNN(params, dataset=test_util.TestDataSet(),
                                       model=model)
    # The test model does not use labels when computing loss, so the label
    # values do not matter as long as it's the right shape.
    labels = np.array([1] * inputs.shape[0])
    bench.input_preprocessor.set_fake_data(inputs, labels)
    bench.run()


def main(_):
  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  if params.model == 'test_model':
    run_with_test_model(params)
  else:
    run_with_real_model(params)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.app.run()
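As a quick sanity check of the zeros_and_ones branch above, the fake batch it builds can be reproduced with NumPy alone; the batch size here is a placeholder and the TestImagePreprocessor wiring is intentionally omitted:

# Standalone sketch of the zeros_and_ones fake input (placeholder batch size of 4).
import numpy as np

batch_size = 4
half = batch_size // 2
images = np.zeros((batch_size, 227, 227, 3), dtype=np.float32)
images[half:, :, :, :] = 1  # second half of the batch is all-ones images
labels = np.array([0] * half + [1] * half, dtype=np.int32)
assert images[0].max() == 0.0 and images[-1].min() == 1.0
assert labels.tolist() == [0, 0, 1, 1]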
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/benchmark_cnn_test.py deleted 100644 → 0
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark_cnn."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
glob
import
os
import
re
import
unittest
import
mock
import
numpy
as
np
import
tensorflow.compat.v1
as
tf
from
google.protobuf
import
text_format
from
tensorflow.core.framework
import
step_stats_pb2
from
tensorflow.core.profiler
import
tfprof_log_pb2
from
tensorflow.python.platform
import
test
import
benchmark_cnn
import
datasets
import
flags
import
preprocessing
import
test_util
import
variable_mgr_util
from
platforms
import
util
as
platforms_util
def
_check_has_gpu
():
if
not
test
.
is_gpu_available
(
cuda_only
=
True
):
raise
ValueError
(
"""You have asked to run part or all of this on GPU, but it appears
that no GPU is available. If your machine has GPUs it is possible you
do not have a version of TensorFlow with GPU support. To build with GPU
support, add --config=cuda to the build flags.
\n
"""
)
class
TfCnnBenchmarksModelTest
(
tf
.
test
.
TestCase
):
"""Tests which are run with multiple models."""
def
setUp
(
self
):
super
(
TfCnnBenchmarksModelTest
,
self
).
setUp
()
benchmark_cnn
.
setup
(
benchmark_cnn
.
make_params
())
def
get_model_name
(
self
):
return
None
# Return true to run tests that don't need to be run on every model.
# This should be done for one or two cheap models.
def
extended_tests
(
self
):
return
False
# Return false to suppress actually running the model; this is useful
# for tests that are large.
def
model_execution_test
(
self
):
return
False
# Return false to suppress actually saving and loading the model.
def
model_save_load_test
(
self
):
return
False
def
testSaveLoadModel
(
self
):
_check_has_gpu
()
if
not
self
.
get_model_name
()
or
not
self
.
model_save_load_test
():
return
params
=
benchmark_cnn
.
make_params
(
model
=
self
.
get_model_name
(),
num_batches
=
1
,
num_intra_threads
=
0
,
num_inter_threads
=
0
,
distortions
=
False
,
batch_size
=
2
,
variable_update
=
'replicated'
,
num_warmup_batches
=
0
,
num_gpus
=
2
,
train_dir
=
test_util
.
get_temp_dir
(
'testSaveLoadModel_'
+
self
.
get_model_name
()))
# Run one batch and save the model.
# Note that this uses a non-test session.
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
run
()
self
.
assertEqual
(
bench
.
init_global_step
,
0
)
# Clear the default graph.
tf
.
reset_default_graph
()
# Test if checkpoint had been saved.
ckpt
=
tf
.
train
.
get_checkpoint_state
(
params
.
train_dir
)
match
=
re
.
match
(
os
.
path
.
join
(
params
.
train_dir
,
r
'model.ckpt-(\d+).index'
),
ckpt
.
model_checkpoint_path
+
'.index'
)
self
.
assertTrue
(
match
)
self
.
assertGreaterEqual
(
int
(
match
.
group
(
1
)),
params
.
num_batches
)
params
=
params
.
_replace
(
num_batches
=
2
)
# Reload the model
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
run
()
# Check if global step has been restored.
self
.
assertNotEqual
(
bench
.
init_global_step
,
0
)
ckpt
=
tf
.
train
.
get_checkpoint_state
(
params
.
train_dir
)
match
=
re
.
match
(
os
.
path
.
join
(
params
.
train_dir
,
r
'model.ckpt-(\d+).index'
),
ckpt
.
model_checkpoint_path
+
'.index'
)
self
.
assertTrue
(
match
)
self
.
assertGreaterEqual
(
int
(
match
.
group
(
1
)),
params
.
num_batches
)
# Check that the batch norm moving averages are restored from checkpoints
with
tf
.
Graph
().
as_default
():
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
_build_model
()
saver
=
tf
.
train
.
Saver
(
bench
.
variable_mgr
.
savable_variables
())
with
tf
.
Session
(
config
=
benchmark_cnn
.
create_config_proto
(
params
))
as
sess
:
benchmark_cnn
.
load_checkpoint
(
saver
,
sess
,
params
.
train_dir
)
sess
.
run
(
bench
.
variable_mgr
.
get_post_init_ops
())
bn_moving_vars
=
[
v
for
v
in
tf
.
global_variables
()
if
'/batchnorm'
in
v
.
name
and
'/moving'
in
v
.
name
]
self
.
assertGreater
(
len
(
bn_moving_vars
),
0
)
for
moving_var
in
bn_moving_vars
:
moving_var_value
=
sess
.
run
(
moving_var
)
# Check that the moving means and moving variances have been restored
# by asserting they are not their default values of 0 and 1,
# respectively
if
'/moving_mean'
in
moving_var
.
name
:
self
.
assertFalse
(
np
.
array_equal
(
moving_var_value
,
np
.
zeros
(
moving_var_value
.
shape
,
moving_var_value
.
dtype
)))
else
:
self
.
assertIn
(
'/moving_variance'
,
moving_var
.
name
)
self
.
assertFalse
(
np
.
array_equal
(
moving_var_value
,
np
.
ones
(
moving_var_value
.
shape
,
moving_var_value
.
dtype
)))
def
testModel
(
self
):
_check_has_gpu
()
if
not
self
.
get_model_name
()
or
not
self
.
model_execution_test
():
return
params
=
benchmark_cnn
.
make_params
(
model
=
self
.
get_model_name
(),
num_batches
=
1
,
num_intra_threads
=
1
,
num_inter_threads
=
12
,
batch_size
=
2
,
distortions
=
False
)
# Run this one; note that this uses a non-test session.
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
run
()
def
testSendRecvVariables
(
self
):
self
.
_testVariables
(
'parameter_server'
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'parameter_server'
,
local_parameter_device
=
'CPU'
)
self
.
_testVariables
(
'parameter_server'
,
optimizer
=
'sgd'
)
def
testReplicatedVariables
(
self
):
self
.
_testVariables
(
'replicated'
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'replicated'
,
all_reduce_spec
=
None
)
self
.
_testVariables
(
'replicated'
,
use_fp16
=
True
,
fp16_vars
=
False
)
self
.
_testVariables
(
'replicated'
,
all_reduce_spec
=
None
,
use_fp16
=
True
,
fp16_vars
=
False
,
fp16_enable_auto_loss_scale
=
True
,
fp16_inc_loss_scale_every_n
=
4
)
def
testIndependentVariables
(
self
):
self
.
_testVariables
(
'independent'
)
self
.
_testVariables
(
'independent'
,
all_reduce_spec
=
None
,
use_fp16
=
True
,
fp16_vars
=
False
,
fp16_enable_auto_loss_scale
=
True
,
fp16_inc_loss_scale_every_n
=
4
)
def
testSummaryVerbosity
(
self
):
self
.
_testVariables
(
'parameter_server'
,
summary_verbosity
=
1
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'parameter_server'
,
summary_verbosity
=
2
)
self
.
_testVariables
(
'parameter_server'
,
summary_verbosity
=
3
)
def
testStagedVariables
(
self
):
self
.
_testVariables
(
'parameter_server'
,
staged_vars
=
True
)
if
self
.
extended_tests
():
self
.
_testVariables
(
'parameter_server'
,
staged_vars
=
True
,
local_parameter_device
=
'CPU'
)
self
.
_testVariables
(
'parameter_server'
,
staged_vars
=
True
,
use_fp16
=
True
,
fp16_vars
=
True
)
def
_assert_correct_var_type
(
self
,
var
,
params
):
if
'gpu_cached_inputs'
not
in
var
.
name
:
if
params
.
use_fp16
and
params
.
fp16_vars
and
'batchnorm'
not
in
var
.
name
:
expected_type
=
tf
.
float16
else
:
expected_type
=
tf
.
float32
self
.
assertEqual
(
var
.
dtype
.
base_dtype
,
expected_type
)
def
_testVariables
(
self
,
variable_update
,
summary_verbosity
=
0
,
local_parameter_device
=
'GPU'
,
staged_vars
=
False
,
optimizer
=
'momentum'
,
# TODO(b/80125832): Enable nccl in tests
# all_reduce_spec='nccl',
all_reduce_spec
=
''
,
use_fp16
=
False
,
fp16_vars
=
False
,
fp16_enable_auto_loss_scale
=
False
,
fp16_inc_loss_scale_every_n
=
10
):
if
not
self
.
get_model_name
():
return
_check_has_gpu
()
params
=
benchmark_cnn
.
make_params
(
model
=
self
.
get_model_name
(),
num_batches
=
1
,
num_intra_threads
=
1
,
num_inter_threads
=
12
,
distortions
=
False
,
variable_update
=
variable_update
,
local_parameter_device
=
local_parameter_device
,
num_gpus
=
2
,
summary_verbosity
=
summary_verbosity
,
staged_vars
=
staged_vars
,
optimizer
=
optimizer
,
all_reduce_spec
=
all_reduce_spec
,
compact_gradient_transfer
=
False
if
all_reduce_spec
==
'nccl'
else
True
,
use_fp16
=
use_fp16
,
fp16_loss_scale
=
2.
,
fp16_vars
=
fp16_vars
,
fp16_enable_auto_loss_scale
=
fp16_enable_auto_loss_scale
,
fp16_inc_loss_scale_every_n
=
fp16_inc_loss_scale_every_n
,
)
# Test building models using multiple GPUs, but don't
# run them.
with
self
.
test_session
(
graph
=
tf
.
Graph
()):
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
_build_model
()
# Rough validation of variable type and placement, depending on mode.
all_vars
=
tf
.
global_variables
()
+
tf
.
local_variables
()
if
params
.
variable_update
==
'parameter_server'
:
for
v
in
all_vars
:
tf
.
logging
.
debug
(
'var: %s'
%
v
.
name
)
match
=
re
.
match
(
r
'tower_(\d+)/v/gpu_cached_inputs:0'
,
v
.
name
)
if
match
:
self
.
assertEqual
(
v
.
device
,
'/device:GPU:%s'
%
match
.
group
(
1
))
elif
v
.
name
.
startswith
(
'v/'
):
self
.
assertEqual
(
v
.
device
,
'/device:%s:0'
%
local_parameter_device
)
self
.
_assert_correct_var_type
(
v
,
params
)
elif
v
.
name
in
(
'input_processing/images:0'
,
'input_processing/labels:0'
,
'init_learning_rate:0'
,
'global_step:0'
,
'loss_scale:0'
,
'loss_scale_normal_steps:0'
):
self
.
assertEqual
(
v
.
device
,
'/device:CPU:0'
)
else
:
raise
ValueError
(
'Unexpected variable %s'
%
v
.
name
)
else
:
v0_count
=
0
v1_count
=
0
for
v
in
all_vars
:
if
v
.
name
.
startswith
(
'tower_0/v0/'
):
self
.
assertEqual
(
v
.
name
,
'tower_0/v0/gpu_cached_inputs:0'
)
self
.
assertEqual
(
v
.
device
,
'/device:GPU:0'
)
elif
v
.
name
.
startswith
(
'tower_1/v1/'
):
self
.
assertEqual
(
v
.
name
,
'tower_1/v1/gpu_cached_inputs:0'
)
self
.
assertEqual
(
v
.
device
,
'/device:GPU:1'
)
elif
v
.
name
.
startswith
(
'v0/'
):
v0_count
+=
1
self
.
assertEqual
(
v
.
device
,
'/device:GPU:0'
)
self
.
_assert_correct_var_type
(
v
,
params
)
elif
v
.
name
.
startswith
(
'v1/'
):
v1_count
+=
1
self
.
assertEqual
(
v
.
device
,
'/device:GPU:1'
)
self
.
_assert_correct_var_type
(
v
,
params
)
elif
v
.
name
in
(
'input_processing/images:0'
,
'input_processing/labels:0'
,
'init_learning_rate:0'
,
'global_step:0'
,
'loss_scale:0'
,
'loss_scale_normal_steps:0'
):
self
.
assertEqual
(
v
.
device
,
'/device:CPU:0'
)
else
:
raise
ValueError
(
'Unexpected variable %s'
%
v
.
name
)
self
.
assertEqual
(
v0_count
,
v1_count
)
# Validate summary ops in the model depending on verbosity level
summary_ops
=
tf
.
get_collection
(
tf
.
GraphKeys
.
SUMMARIES
)
num_summary_ops
=
len
(
summary_ops
)
self
.
assertEqual
(
num_summary_ops
>
0
,
summary_verbosity
>
0
)
if
summary_verbosity
>
0
:
has_affine_histogram
=
False
has_gradient_histogram
=
False
has_log_gradients_histogram
=
False
for
op
in
summary_ops
:
if
'/gradients'
in
op
.
name
:
has_gradient_histogram
=
True
elif
'/affine'
in
op
.
name
:
has_affine_histogram
=
True
elif
'log_gradients'
in
op
.
name
:
has_log_gradients_histogram
=
True
self
.
assertEqual
(
summary_verbosity
>=
3
,
has_affine_histogram
)
self
.
assertEqual
(
summary_verbosity
>=
3
,
has_gradient_histogram
)
self
.
assertEqual
(
summary_verbosity
>=
2
,
has_log_gradients_histogram
)
if
summary_verbosity
==
1
:
self
.
assertLess
(
num_summary_ops
,
10
)
class
TrivialModelTest
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'trivial'
class
TestVgg1Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'vgg11'
class
TestVgg19Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'vgg19'
class
TestLenet5Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'lenet'
class
TestGooglenetModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'googlenet'
class
TestOverfeatModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'overfeat'
class
TestAlexnetModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'alexnet'
def
extended_tests
(
self
):
return
True
class
TestTrivialModel
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'trivial'
class
TestInceptionv3Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'inception3'
def
extended_tests
(
self
):
return
True
class
TestInceptionv4Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'inception4'
class
TestResnet50Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet50'
def
model_save_load_test
(
self
):
return
True
class
TestResnet101Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet101'
class
TestResnet152Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet152'
class
TestResnet50V2Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet50_v2'
class
TestResnet101V2Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet101_v2'
class
TestResnet152V2Model
(
TfCnnBenchmarksModelTest
):
def
get_model_name
(
self
):
return
'resnet152_v2'
class
TfCnnBenchmarksTest
(
tf
.
test
.
TestCase
):
"""Tests that benchmark_cnn runs correctly."""
def
setUp
(
self
):
super
(
TfCnnBenchmarksTest
,
self
).
setUp
()
_check_has_gpu
()
benchmark_cnn
.
setup
(
benchmark_cnn
.
make_params
())
def
_run_benchmark_cnn
(
self
,
params
):
logs
=
[]
benchmark_cnn
.
log_fn
=
test_util
.
print_and_add_to_list
(
logs
)
benchmark_cnn
.
BenchmarkCNN
(
params
).
run
()
return
logs
def
_run_benchmark_cnn_with_fake_images
(
self
,
params
,
images
,
labels
):
logs
=
[]
benchmark_cnn
.
log_fn
=
test_util
.
print_and_add_to_list
(
logs
)
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
bench
.
input_preprocessor
=
preprocessing
.
TestImagePreprocessor
(
params
.
batch_size
*
params
.
num_gpus
,
[[
params
.
batch_size
,
227
,
227
,
3
],
[
params
.
batch_size
]],
params
.
num_gpus
,
bench
.
model
.
data_type
)
bench
.
dataset
.
_queue_runner_required
=
True
bench
.
input_preprocessor
.
set_fake_data
(
images
,
labels
)
bench
.
input_preprocessor
.
expected_subset
=
(
'validation'
if
params
.
eval
else
'train'
)
bench
.
run
()
return
logs
def
_run_benchmark_cnn_with_black_and_white_images
(
self
,
params
):
"""Runs BenchmarkCNN with black and white images.
A BenchmarkCNN is created and run with black and white images as input. Half
the images are black (i.e., filled with 0s) and half are white (i.e., filled
with 255s).
Args:
params: Params for BenchmarkCNN.
Returns:
A list of lines from the output of BenchmarkCNN.
"""
# TODO(reedwm): Instead of generating images here, use black and white
# tfrecords by calling test_util.create_black_and_white_images().
effective_batch_size
=
params
.
batch_size
*
params
.
num_gpus
half_batch_size
=
effective_batch_size
//
2
images
=
np
.
zeros
((
effective_batch_size
,
227
,
227
,
3
),
dtype
=
np
.
float32
)
images
[
half_batch_size
:,
:,
:,
:]
=
255
labels
=
np
.
array
([
0
]
*
half_batch_size
+
[
1
]
*
half_batch_size
,
dtype
=
np
.
int32
)
return
self
.
_run_benchmark_cnn_with_fake_images
(
params
,
images
,
labels
)
def
_train_and_eval_local
(
self
,
params
,
check_output_values
=
False
,
max_final_loss
=
10.
,
skip
=
None
,
use_test_preprocessor
=
True
):
# TODO(reedwm): check_output_values should default to True and be enabled
# on every test. Currently, if check_output_values=True and the calls to
# tf.set_random_seed(...) and np.seed(...) are passed certain seed values in
# benchmark_cnn.py, then most tests will fail. This indicates the tests
# are brittle and could fail with small changes when
# check_output_values=True, so check_output_values defaults to False for
# now.
def
run_fn
(
run_type
,
inner_params
):
del
run_type
if
use_test_preprocessor
:
return
[
self
.
_run_benchmark_cnn_with_black_and_white_images
(
inner_params
)
]
else
:
return
[
self
.
_run_benchmark_cnn
(
inner_params
)]
return
test_util
.
train_and_eval
(
self
,
run_fn
,
params
,
check_output_values
=
check_output_values
,
max_final_loss
=
max_final_loss
,
skip
=
skip
)
def
testAlexnet
(
self
):
params
=
test_util
.
get_params
(
'testAlexnet'
).
_replace
(
num_batches
=
30
,
init_learning_rate
=
0.01
,
model
=
'alexnet'
)
self
.
_train_and_eval_local
(
params
)
def
testNoPrintAccuracy
(
self
):
params
=
test_util
.
get_params
(
'testNoPrintAccuracy'
).
_replace
(
print_training_accuracy
=
False
)
self
.
_train_and_eval_local
(
params
)
def
testLowAccuracy
(
self
):
params
=
test_util
.
get_params
(
'testLowAccuracy'
).
_replace
(
print_training_accuracy
=
True
,
batch_size
=
5
,
num_batches
=
10
)
# We force low accuracy by having each batch containing 10 identical images,
# each with a different label. This guarantees a top-1 accuracy of exactly
# 0.1 and a top-5 accuracy of exactly 0.5.
images
=
np
.
zeros
((
10
,
227
,
227
,
3
),
dtype
=
np
.
float32
)
labels
=
np
.
arange
(
10
,
dtype
=
np
.
int32
)
logs
=
self
.
_run_benchmark_cnn_with_fake_images
(
params
,
images
,
labels
)
training_outputs
=
test_util
.
get_training_outputs_from_logs
(
logs
,
params
.
print_training_accuracy
)
last_output
=
training_outputs
[
-
1
]
# TODO(reedwm): These should be assertEqual but for some reason,
# occasionally the accuracies are lower (Running this test 500 times, these
# asserts failed twice). Investigate this problem.
self
.
assertLessEqual
(
last_output
.
top_1_accuracy
,
0.1
)
self
.
assertLessEqual
(
last_output
.
top_5_accuracy
,
0.5
)
def
testParameterServer
(
self
):
params
=
test_util
.
get_params
(
'testParameterServer'
)
self
.
_train_and_eval_local
(
params
)
def
testParameterServerStaged
(
self
):
params
=
test_util
.
get_params
(
'testParameterServerStaged'
).
_replace
(
staged_vars
=
True
)
self
.
_train_and_eval_local
(
params
)
def
testReplicated
(
self
):
params
=
test_util
.
get_params
(
'testReplicated'
).
_replace
(
variable_update
=
'replicated'
)
self
.
_train_and_eval_local
(
params
)
def
testIndependent
(
self
):
params
=
test_util
.
get_params
(
'testIndependent'
).
_replace
(
variable_update
=
'independent'
)
self
.
_train_and_eval_local
(
params
)
def
testForwardOnly
(
self
):
params
=
test_util
.
get_params
(
'testForwardOnly'
).
_replace
(
forward_only
=
True
)
# Evaluation is not supported with --forward_only, so we set skip='eval'.
self
.
_train_and_eval_local
(
params
,
skip
=
'eval'
)
def
testForwardOnlyAndFreeze
(
self
):
params
=
test_util
.
get_params
(
'testForwardOnlyAndFreeze'
).
_replace
(
forward_only
=
True
,
freeze_when_forward_only
=
True
,
train_dir
=
None
)
# Training is not supported with --freeze_when_forward_only.
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
def
testNoDistortions
(
self
):
params
=
test_util
.
get_params
(
'testNoDistortions'
).
_replace
(
distortions
=
False
)
self
.
_train_and_eval_local
(
params
)
def
testCpuAsLocalParamDevice
(
self
):
params
=
test_util
.
get_params
(
'testCpuAsLocalParamDevice'
).
_replace
(
local_parameter_device
=
'cpu'
)
self
.
_train_and_eval_local
(
params
)
def
testNHWC
(
self
):
params
=
test_util
.
get_params
(
'testNHWC'
).
_replace
(
data_format
=
'NHWC'
)
self
.
_train_and_eval_local
(
params
)
def
testCpuAsDevice
(
self
):
params
=
test_util
.
get_params
(
'testCpuAsDevice'
).
_replace
(
device
=
'cpu'
,
data_format
=
'NHWC'
)
# NHWC required when --device=cpu
self
.
_train_and_eval_local
(
params
)
def
testMomentumParameterServer
(
self
):
params
=
test_util
.
get_params
(
'testMomentumParameterServer'
).
_replace
(
optimizer
=
'momentum'
,
momentum
=
0.8
)
self
.
_train_and_eval_local
(
params
)
def
testRmspropReplicated
(
self
):
params
=
test_util
.
get_params
(
'testRmspropReplicated'
).
_replace
(
variable_update
=
'replicated'
,
optimizer
=
'rmsprop'
,
rmsprop_decay
=
0.8
,
rmsprop_momentum
=
0.6
,
rmsprop_epsilon
=
0.7
,
init_learning_rate
=
0.01
)
self
.
_train_and_eval_local
(
params
)
def
testBatchGroupSize
(
self
):
params
=
test_util
.
get_params
(
'testBatchGroupSize'
).
_replace
(
batch_group_size
=
4
,
num_batches
=
100
,
num_warmup_batches
=
5
)
self
.
_train_and_eval_local
(
params
)
def
testGradientClip
(
self
):
params
=
test_util
.
get_params
(
'testGradientClip'
).
_replace
(
gradient_clip
=
100.0
)
self
.
_train_and_eval_local
(
params
)
def
testWeightDecay
(
self
):
params
=
test_util
.
get_params
(
'testWeightDecay'
).
_replace
(
weight_decay
=
0.0001
)
self
.
_train_and_eval_local
(
params
)
def
testNoLayers
(
self
):
params
=
test_util
.
get_params
(
'testNoLayers'
).
_replace
(
use_tf_layers
=
False
)
self
.
_train_and_eval_local
(
params
)
def
testSaveModelSteps
(
self
):
params
=
test_util
.
get_params
(
'testSaveModelSteps'
).
_replace
(
save_model_steps
=
2
,
num_warmup_batches
=
0
,
num_batches
=
10
,
max_ckpts_to_keep
=
3
)
self
.
_train_and_eval_local
(
params
)
for
i
in
range
(
1
,
20
+
1
):
# We train for 20 steps, since self._train_and_eval_local() does two
# training runs of 10 steps each. We save a checkpoint every 2 steps and
# keep the last 3 checkpoints, so at the end, we should have checkpoints
# for steps 16, 18, and 20.
matches
=
glob
.
glob
(
os
.
path
.
join
(
params
.
train_dir
,
'model.ckpt-{}.*'
.
format
(
i
)))
if
i
in
(
16
,
18
,
20
):
self
.
assertTrue
(
matches
)
else
:
self
.
assertFalse
(
matches
)
def
testFp16WithFp32Vars
(
self
):
params
=
test_util
.
get_params
(
'testFp16WithFp32Vars'
).
_replace
(
use_fp16
=
True
,
fp16_vars
=
False
,
fp16_loss_scale
=
1.
)
self
.
_train_and_eval_local
(
params
)
def
testFp16WithFp16Vars
(
self
):
params
=
test_util
.
get_params
(
'testFp16WithFp16Vars'
).
_replace
(
use_fp16
=
True
,
fp16_vars
=
True
)
self
.
_train_and_eval_local
(
params
)
def
testXlaCompile
(
self
):
params
=
test_util
.
get_params
(
'testXlaCompile'
).
_replace
(
xla_compile
=
True
)
self
.
_train_and_eval_local
(
params
)
@
unittest
.
skip
(
'Fails for unknown reason'
)
def
testXlaCompileWithFp16
(
self
):
params
=
test_util
.
get_params
(
'testXlaCompileWithFp16'
).
_replace
(
use_fp16
=
True
,
xla_compile
=
True
)
self
.
_train_and_eval_local
(
params
)
def
testGradientRepacking
(
self
):
params
=
test_util
.
get_params
(
'testGradientRepacking1'
).
_replace
(
gradient_repacking
=
2
)
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
params
=
test_util
.
get_params
(
'testGradientRepacking2'
).
_replace
(
gradient_repacking
=
2
,
use_fp16
=
True
)
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
def
testTraceFileChromeTraceFormat
(
self
):
trace_file
=
os
.
path
.
join
(
self
.
get_temp_dir
(),
'testTraceFileChromeTraceFormat_tracefile'
)
params
=
test_util
.
get_params
(
'testTraceFileChromeTraceFormat'
).
_replace
(
trace_file
=
trace_file
,
use_chrome_trace_format
=
True
)
self
.
_train_and_eval_local
(
params
)
self
.
assertGreater
(
os
.
stat
(
trace_file
).
st_size
,
0
)
def
testTraceFileStepStatsProto
(
self
):
trace_file
=
os
.
path
.
join
(
self
.
get_temp_dir
(),
'testTraceFileStepStatsProto_tracefile'
)
params
=
test_util
.
get_params
(
'testTraceFileStepStatsProto'
).
_replace
(
trace_file
=
trace_file
,
use_chrome_trace_format
=
False
)
self
.
_train_and_eval_local
(
params
)
self
.
assertGreater
(
os
.
stat
(
trace_file
).
st_size
,
0
)
with
open
(
trace_file
)
as
f
:
step_stats
=
step_stats_pb2
.
StepStats
()
# The following statement should not raise an exception.
contents
=
f
.
read
()
text_format
.
Merge
(
contents
,
step_stats
)
def
testTfprofFile
(
self
):
tfprof_file
=
os
.
path
.
join
(
self
.
get_temp_dir
(),
'testTfprofFile_tfproffile'
)
params
=
test_util
.
get_params
(
'testTfprofFile'
).
_replace
(
tfprof_file
=
tfprof_file
)
self
.
_train_and_eval_local
(
params
,
skip
=
'eval_and_train_from_checkpoint'
)
self
.
assertGreater
(
os
.
stat
(
tfprof_file
).
st_size
,
0
)
with
open
(
tfprof_file
,
'rb'
)
as
f
:
profile_proto
=
tfprof_log_pb2
.
ProfileProto
()
# The following statement should not raise an exception.
profile_proto
.
ParseFromString
(
f
.
read
())
@
unittest
.
skip
(
'Fails for unknown reason'
)
def
testMoveTrainDir
(
self
):
params
=
test_util
.
get_params
(
'testMoveTrainDir'
)
self
.
_train_and_eval_local
(
params
)
new_train_dir
=
params
.
train_dir
+
'_moved'
os
.
rename
(
params
.
train_dir
,
new_train_dir
)
params
=
params
.
_replace
(
train_dir
=
new_train_dir
,
eval
=
True
)
self
.
_run_benchmark_cnn_with_black_and_white_images
(
params
)
@
mock
.
patch
(
'tensorflow.compat.v1.train.Saver'
)
@
mock
.
patch
(
'benchmark_cnn._get_checkpoint_to_load'
)
def
testLoadCheckpoint
(
self
,
mock_checkpoint_to_load
,
mock_saver
):
"""Tests load checkpoint with full path to checkpoint."""
expected_checkpoint
=
'/path/to/checkpoints/model.ckpt-1243'
mock_checkpoint_to_load
.
return_value
=
expected_checkpoint
global_batch
=
benchmark_cnn
.
load_checkpoint
(
mock_saver
,
None
,
expected_checkpoint
)
self
.
assertEqual
(
global_batch
,
1243
)
def
testGetCheckpointToLoadFullPath
(
self
):
"""Tests passing full path."""
ckpt_path
=
'/foo/bar/model.ckpt-189'
full_path
=
benchmark_cnn
.
_get_checkpoint_to_load
(
ckpt_path
)
self
.
assertEqual
(
full_path
,
ckpt_path
)
def
testGetCheckpointToLoadException
(
self
):
"""Tests exception for directory without a checkpoint."""
ckpt_path
=
'/foo/bar/checkpoints'
self
.
assertRaises
(
benchmark_cnn
.
CheckpointNotFoundException
,
benchmark_cnn
.
_get_checkpoint_to_load
,
ckpt_path
)
@
mock
.
patch
(
'tensorflow.compat.v1.train.get_checkpoint_state'
)
def
testGetCheckpointToLoad
(
self
,
mock_checkpoint_state
):
"""Tests passing path to checkpoint folder."""
expected_checkpoint
=
'/path/to/checkpoints/model.ckpt-1243'
mock_checkpoint_state
.
return_value
=
mock
.
Mock
(
model_checkpoint_path
=
expected_checkpoint
)
ckpt_path
=
'/path/to/checkpoints/'
full_path
=
benchmark_cnn
.
_get_checkpoint_to_load
(
ckpt_path
)
self
.
assertEqual
(
full_path
,
expected_checkpoint
)
def
testImagenetPreprocessor
(
self
):
imagenet_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
)
params
=
test_util
.
get_params
(
'testImagenetPreprocessor'
).
_replace
(
data_dir
=
imagenet_dir
,
data_name
=
'imagenet'
)
self
.
_train_and_eval_local
(
params
,
use_test_preprocessor
=
False
)
def
testImagenetPreprocessorNoDistortions
(
self
):
imagenet_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
)
params
=
test_util
.
get_params
(
'testImagenetPreprocessorNoDistortions'
).
_replace
(
data_dir
=
imagenet_dir
,
data_name
=
'imagenet'
,
distortions
=
False
)
self
.
_train_and_eval_local
(
params
,
use_test_preprocessor
=
False
)
def
testImagenetPreprocessorVerboseSummary
(
self
):
imagenet_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
)
params
=
test_util
.
get_params
(
'testImagenetPreprocessorVerboseSummary'
).
_replace
(
data_dir
=
imagenet_dir
,
data_name
=
'imagenet'
,
distortions
=
False
,
summary_verbosity
=
2
)
self
.
_train_and_eval_local
(
params
,
use_test_preprocessor
=
False
)
def
testCifar10SyntheticData
(
self
):
params
=
test_util
.
get_params
(
'testCifar10SyntheticData'
).
_replace
(
data_name
=
'cifar10'
)
self
.
_train_and_eval_local
(
params
)
def
testShiftRatio
(
self
):
test_util
.
monkey_patch_base_cluster_manager
()
params
=
benchmark_cnn
.
make_params
(
data_name
=
'imagenet'
,
data_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
),
job_name
=
'worker'
,
worker_hosts
=
'w1,w2,w3,w4'
,
ps_hosts
=
'p1'
,
task_index
=
0
)
self
.
assertEqual
(
benchmark_cnn
.
BenchmarkCNN
(
params
).
input_preprocessor
.
shift_ratio
,
0.0
)
params
=
params
.
_replace
(
task_index
=
3
)
self
.
assertEqual
(
benchmark_cnn
.
BenchmarkCNN
(
params
).
input_preprocessor
.
shift_ratio
,
0.75
)
def
testDistributedReplicatedSavableVars
(
self
):
test_util
.
monkey_patch_base_cluster_manager
()
params
=
benchmark_cnn
.
make_params
(
variable_update
=
'distributed_replicated'
,
model
=
'inception4'
,
data_name
=
'imagenet'
,
data_dir
=
os
.
path
.
join
(
platforms_util
.
get_test_data_dir
(),
'fake_tf_record_data'
),
job_name
=
'worker'
,
worker_hosts
=
'w1,w2,w3,w4'
,
ps_hosts
=
'p1'
,
datasets_use_prefetch
=
False
)
bench
=
benchmark_cnn
.
BenchmarkCNN
(
params
)
with
tf
.
Graph
().
as_default
():
bench
.
_build_model
()
savable_vars
=
bench
.
variable_mgr
.
savable_variables
()
# Assert all global variables are in savable_vars
for
v
in
tf
.
global_variables
():
if
not
v
.
name
.
startswith
(
variable_mgr_util
.
PS_SHADOW_VAR_PREFIX
+
'/v0'
):
self
.
assertEqual
(
v
.
name
,
'global_step:0'
)
name
=
bench
.
variable_mgr
.
_strip_port
(
v
.
name
)
if
name
.
startswith
(
variable_mgr_util
.
PS_SHADOW_VAR_PREFIX
):
name
=
name
[
len
(
variable_mgr_util
.
PS_SHADOW_VAR_PREFIX
+
'/'
):]
self
.
assertIn
(
name
,
savable_vars
)
self
.
assertIn
(
savable_vars
[
name
],
tf
.
global_variables
())
# Assert all local variables on the first tower are in savable_vars
for
v
in
tf
.
local_variables
():
if
v
.
name
.
startswith
(
'v0/'
):
name
=
bench
.
variable_mgr
.
_strip_port
(
v
.
name
)
self
.
assertIn
(
name
,
savable_vars
)
def
_test_preprocessing_eval
(
self
,
image_height
,
image_width
,
output_height
,
output_width
):
image
=
tf
.
fill
((
image_height
,
image_width
,
3
),
tf
.
constant
(
128
,
dtype
=
tf
.
uint8
))
params
=
benchmark_cnn
.
make_params
()
new_image
=
preprocessing
.
eval_image
(
image
,
output_height
,
output_width
,
0
,
'bilinear'
,
params
.
summary_verbosity
)
with
self
.
test_session
()
as
sess
:
new_image_value
=
sess
.
run
(
new_image
)
self
.
assertAllEqual
(
new_image_value
,
np
.
full
((
output_height
,
output_width
,
3
),
128
,
dtype
=
np
.
uint8
))
  def testPreprocessingEval(self):
    self._test_preprocessing_eval(10, 10, 4, 4)
    self._test_preprocessing_eval(4, 4, 10, 10)
    self._test_preprocessing_eval(1, 100, 100, 1)
    self._test_preprocessing_eval(100, 1, 1, 100)
    self._test_preprocessing_eval(1, 100, 1, 100)

  def _test_preprocessing_traing(self, image_buf, image_color, output_height,
                                 output_width, bbox, batch_position,
                                 resize_method, distortions,
                                 summary_verbosity, fuse_decode_and_crop):
    new_image = preprocessing.train_image(
        image_buf,
        output_height,
        output_width,
        bbox,
        batch_position,
        resize_method,
        distortions,
        summary_verbosity=summary_verbosity,
        fuse_decode_and_crop=fuse_decode_and_crop)
    self.assertEqual(new_image.shape, [output_height, output_width, 3])
    with self.test_session(use_gpu=True) as sess:
      new_image_value = sess.run(new_image)
    self.assertAllClose(
        new_image_value,
        np.full([output_height, output_width, 3], image_color,
                dtype=np.float32),
        atol=50.,
        rtol=0.)

  def testPreprocessingTrain(self):
    test_data_dir = os.path.join(platforms_util.get_test_data_dir(), 'images')
    black_file = os.path.join(test_data_dir, 'black_image.jpg')
    with open(black_file, 'rb') as f:
      black_jpg_buffer = f.read()
    white_file = os.path.join(test_data_dir, 'white_image.jpg')
    with open(white_file, 'rb') as f:
      white_jpg_buffer = f.read()
    bbox = tf.zeros((1, 0, 4), dtype=tf.float32)
    batch_position = 0
    # Each size config is (output_height, output_width, resize_method)
    size_configs = [(100, 100, 'round_robin'), (150, 10, 'bilinear'),
                    (10, 150, 'nearest')]
    # Each image config is (image_buf, image_color)
    image_configs = [(white_jpg_buffer, 255), (black_jpg_buffer, 0)]
    for (image_buf, image_color) in image_configs:
      for output_height, output_width, resize_method in size_configs:
        for distortions in [True, False]:
          for summary_verbosity in [0, 2]:
            for fuse_decode_and_crop in [True, False]:
              self._test_preprocessing_traing(
                  image_buf, image_color, output_height, output_width, bbox,
                  batch_position, resize_method, distortions,
                  summary_verbosity, fuse_decode_and_crop)

  def _test_learning_rate(self, params, global_step_to_expected_learning_rate):
    self.longMessage = True  # pylint: disable=invalid-name
    bench = benchmark_cnn.BenchmarkCNN(params)
    with tf.Graph().as_default() as graph:
      bench._build_model()
      global_step = graph.get_tensor_by_name('global_step:0')
      learning_rate = graph.get_tensor_by_name('learning_rate_tensor:0')
      with self.test_session(graph=graph, use_gpu=True) as sess:
        items = global_step_to_expected_learning_rate.items()
        for global_step_val, expected_learning_rate in items:
          self.assertAlmostEqual(
              sess.run(learning_rate, {global_step: global_step_val}),
              expected_learning_rate,
              msg='at global_step:{}'.format(global_step_val))

  def testLearningRateModelSpecificResNet(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       batch_size=256,
                                       variable_update='parameter_server',
                                       num_gpus=1)
    self._test_learning_rate(params, {
        0: 0,
        150136: 0.128,
        150137: 0.0128,
        300273: 0.0128,
        300274: 0.00128,
        10000000: 0.0000128
    })

  def testLearningRateUserProvidedInitLr(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       batch_size=256,
                                       variable_update='replicated',
                                       init_learning_rate=1.)
    self._test_learning_rate(params, {
        0: 1.,
        10000000: 1.
    })

  def testLearningRateUserProvidedInitLrAndWarmup(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       batch_size=256,
                                       variable_update='replicated',
                                       init_learning_rate=1.,
                                       num_learning_rate_warmup_epochs=5)
    self._test_learning_rate(params, {
        0: 0.,
        12511: 0.5,
        25022: 1.,
        10000000: 1.
    })

  def testLearningRateUserProvidedDecayInfo(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       init_learning_rate=1.,
                                       learning_rate_decay_factor=0.5,
                                       num_epochs_per_decay=2,
                                       minimum_learning_rate=0.3750,
                                       batch_size=32)
    self._test_learning_rate(params, {
        0: 1.,
        80071: 1.,
        80072: 0.5,
        160143: 0.5,
        160144: 0.375,
        10000000: 0.375
    })

  def testLearningRateUserProvidedZeroDecay(self):
    params = benchmark_cnn.make_params(model='resnet50',
                                       num_learning_rate_warmup_epochs=0,
                                       learning_rate_decay_factor=0.5,
                                       num_epochs_per_decay=0,
                                       minimum_learning_rate=0.3750,
                                       batch_size=32)
    with self.assertRaises(ValueError):
      with tf.Graph().as_default():
        # This will fail because params.learning_rate_decay_factor cannot be
        # nonzero if params.num_epochs_per_decay is zero.
        benchmark_cnn.BenchmarkCNN(params)._build_model()

  def testLearningRateUserProvidedSchedule(self):
    params = benchmark_cnn.make_params(
        model='trivial',
        batch_size=32,
        piecewise_learning_rate_schedule='1;3;.1;5;.01')
    self._test_learning_rate(params, {
        0: 1.,
        120108: 1.,
        120109: 0.1,
        200181: 0.1,
        200182: 0.01,
        100000000: 0.01
    })

  def testNumBatchesAndEpochs(self):
    params = benchmark_cnn.make_params()
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 10, 100)
    self.assertEqual(batches, benchmark_cnn._DEFAULT_NUM_BATCHES)
    self.assertAlmostEqual(epochs,
                           float(benchmark_cnn._DEFAULT_NUM_BATCHES) / 10)
    params = benchmark_cnn.make_params(num_batches=21)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 25, 50)
    self.assertEqual(batches, 21)
    self.assertAlmostEqual(epochs, 10.5)
    params = benchmark_cnn.make_params(num_epochs=3)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3)
    self.assertEqual(batches, 5)
    self.assertAlmostEqual(epochs, 10. / 3.)
    params = benchmark_cnn.make_params(num_epochs=4)
    batches, epochs = benchmark_cnn.get_num_batches_and_epochs(params, 2, 3)
    self.assertEqual(batches, 6)
    self.assertAlmostEqual(epochs, 4)
    with self.assertRaises(ValueError):
      params = benchmark_cnn.make_params(num_batches=100, num_epochs=100)
      benchmark_cnn.get_num_batches_and_epochs(params, 1, 1)

  def _testEvalDuringTraining(self, params, expected_num_eval_batches_found):
    # The idea of this test is that all train images are black and all eval
    # images are white. We pass the images through the TestModel, and ensure
    # the outputs are as expected.
    batch_size = params.batch_size
    eval_batch_size = params.eval_batch_size or params.batch_size

    class TestModel(test_util.TestCNNModel):

      def __init__(self):
        super(TestModel, self).__init__()
        self.depth = 3

      def add_inference(self, cnn):
        if cnn.phase_train:
          # This will allow us to test that 100 is only added during training
          # and not during eval.
          cnn.top_layer += 100
          assert cnn.top_layer.shape[0] == batch_size
        else:
          assert cnn.top_layer.shape[0] == eval_batch_size
        # Reduce the image to a single number. The number should be (-1 + 100)
        # during training and 1 during testing.
        cnn.top_layer = tf.reshape(cnn.top_layer, (cnn.top_layer.shape[0], -1))
        cnn.top_layer = tf.reduce_mean(cnn.top_layer, axis=1)
        cnn.top_layer = tf.reshape(cnn.top_layer,
                                   (cnn.top_layer.shape[0], 1, 1, 1))
        cnn.top_size = 1
        trainable_vars = tf.trainable_variables()
        # The super method will compute image*A*B, where A=1 and B=2.
        super(TestModel, self).add_inference(cnn)
        if not cnn.phase_train:
          # Assert no new variables were added, since they should be reused from
          # training.
          assert len(trainable_vars) == len(tf.trainable_variables())

    model = TestModel()
    dataset = datasets.ImagenetDataset(params.data_dir)
    logs = []
    bench_cnn = benchmark_cnn.BenchmarkCNN(params, model=model, dataset=dataset)
    with test_util.monkey_patch(benchmark_cnn,
                                log_fn=test_util.print_and_add_to_list(logs)):
      bench_cnn.run()
    training_outputs = test_util.get_training_outputs_from_logs(
        logs, print_training_accuracy=False)
    self.assertEqual(len(training_outputs), params.num_batches)
    expected_training_output = (-1 + 100) * 1 * 2
    for training_output in training_outputs:
      self.assertEqual(training_output.loss, expected_training_output)
    eval_outputs = test_util.get_evaluation_outputs_from_logs(logs)
    self.assertTrue(eval_outputs)
    expected_eval_output = 1 * 1 * 2
    for eval_output in eval_outputs:
      self.assertEqual(eval_output.top_1_accuracy, expected_eval_output)
      self.assertEqual(eval_output.top_5_accuracy, expected_eval_output)
    num_eval_batches_found = 0
    eval_batch_regex = re.compile(r'^\d+\t[0-9.]+ examples/sec$')
    for log in logs:
      if eval_batch_regex.match(log):
        num_eval_batches_found += 1
    self.assertEqual(num_eval_batches_found, expected_num_eval_batches_found)

  def testEvalDuringTraining(self):
    data_dir = test_util.create_black_and_white_images()
    base_params = test_util.get_params('testEvalDuringTraining')
    train_dir = base_params.train_dir
    base_params = base_params._replace(
        train_dir=None, print_training_accuracy=False, num_warmup_batches=0,
        num_batches=7, num_eval_batches=2, display_every=1,
        init_learning_rate=0, weight_decay=0, distortions=False,
        data_dir=data_dir)
    expected_num_eval_batches_found = (
        base_params.num_eval_batches * (base_params.num_batches // 2 + 1))

    # Test --eval_during_training_every_n_steps
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='parameter_server'),
        expected_num_eval_batches_found)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated'),
        expected_num_eval_batches_found)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated',
                             summary_verbosity=2,
                             save_summaries_steps=2,
                             datasets_use_prefetch=False),
        expected_num_eval_batches_found)
    self._testEvalDuringTraining(
        base_params._replace(eval_during_training_every_n_steps=2,
                             variable_update='replicated',
                             use_fp16=True,
                             train_dir=train_dir,
                             eval_batch_size=base_params.batch_size + 2),
        expected_num_eval_batches_found)

    # Test --eval_during_training_every_n_epochs
    every_n_epochs = (2 * base_params.batch_size * base_params.num_gpus /
                      datasets.IMAGENET_NUM_TRAIN_IMAGES)
    self._testEvalDuringTraining(
        base_params._replace(
            eval_during_training_every_n_epochs=every_n_epochs,
            variable_update='replicated'),
        expected_num_eval_batches_found)

    # Test --eval_during_training_at_specified_steps
    list_steps = [2, 3, 5, 7, 1000]
    num_eval_steps = 1 + sum(
        1 for step in list_steps if step < base_params.num_batches)
    expected_num_eval_batches_found = (
        base_params.num_eval_batches * num_eval_steps)
    self._testEvalDuringTraining(
        base_params._replace(
            eval_during_training_at_specified_steps=list_steps,
            variable_update='replicated'),
        expected_num_eval_batches_found)

    # Test --eval_during_training_at_specified_epochs
    list_epochs = [(step * base_params.batch_size * base_params.num_gpus /
                    datasets.IMAGENET_NUM_TRAIN_IMAGES)
                   for step in list_steps]
    self._testEvalDuringTraining(
        base_params._replace(
            eval_during_training_at_specified_epochs=list_epochs,
            variable_update='replicated'),
        expected_num_eval_batches_found)

    # Test --eval_during_training_every_n_steps runs with synthetic data.
    params = base_params._replace(variable_update='replicated', data_dir=None,
                                  eval_during_training_every_n_steps=2,
                                  num_batches=2)
    benchmark_cnn.BenchmarkCNN(params).run()

  def testEvalDuringTrainingNumEpochs(self):
    params = benchmark_cnn.make_params(
        batch_size=1, eval_batch_size=2, eval_during_training_every_n_steps=1,
        num_batches=30,
        num_eval_epochs=100 / datasets.IMAGENET_NUM_VAL_IMAGES)
    bench_cnn = benchmark_cnn.BenchmarkCNN(params)
    self.assertEqual(bench_cnn.num_batches, 30)
    self.assertAlmostEqual(bench_cnn.num_epochs,
                           30 / datasets.IMAGENET_NUM_TRAIN_IMAGES)
    self.assertAlmostEqual(bench_cnn.num_eval_batches, 50)
    self.assertAlmostEqual(bench_cnn.num_eval_epochs,
                           100 / datasets.IMAGENET_NUM_VAL_IMAGES)

  def testEarlyStopping(self):
    params = benchmark_cnn.make_params(
        batch_size=2,
        display_every=1,
        num_batches=100,
        eval_during_training_every_n_steps=2,
        stop_at_top_1_accuracy=0.4,
    )
    with mock.patch.object(benchmark_cnn.BenchmarkCNN, '_eval_once',
                           side_effect=[(0.1, 0.1), (0.5, 0.5), (0.2, 0.2)]
                          ) as mock_eval_once:
      logs = []
      bench_cnn = benchmark_cnn.BenchmarkCNN(params)
      with test_util.monkey_patch(
          benchmark_cnn, log_fn=test_util.print_and_add_to_list(logs)):
        bench_cnn.run()
      training_outputs = test_util.get_training_outputs_from_logs(
          logs, print_training_accuracy=False)
      # We should stop after the second evaluation, and we evaluate every 2
      # steps. So there should be 2 * 2 = 4 training outputs.
      self.assertEqual(len(training_outputs), 4)
      self.assertEqual(mock_eval_once.call_count, 2)

  def testOutOfRangeErrorsAreNotIgnored(self):
    error_msg = 'Fake OutOfRangeError error message'
    with mock.patch.object(
        benchmark_cnn.BenchmarkCNN, 'benchmark_with_session',
        side_effect=tf.errors.OutOfRangeError(None, None, error_msg)):
      with self.assertRaisesRegex(RuntimeError, error_msg):
        benchmark_cnn.BenchmarkCNN(benchmark_cnn.make_params()).run()

  def testInvalidFlags(self):
    params = benchmark_cnn.make_params(device='cpu', data_format='NCHW')
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)
    params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                       variable_update='replicated',
                                       all_reduce_spec='nccl')
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)
    # Automatic loss scaling is only supported for 'replicated', 'ps',
    # and 'independent' variable_updates.
    invalid_variable_updates = [
        'distributed_replicated', 'distributed_all_reduce'
    ]
    for variable_update in invalid_variable_updates:
      params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                         fp16_enable_auto_loss_scale=True,
                                         variable_update=variable_update)
      with self.assertRaises(ValueError):
        benchmark_cnn.BenchmarkCNN(params)
    # Automatic loss scaling is not supported for 'nccl'.
    params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                       fp16_enable_auto_loss_scale=True,
                                       all_reduce_spec='nccl')
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)
    # Automatic loss scaling is not supported for 'staged_vars'.
    params = benchmark_cnn.make_params(use_fp16=True, fp16_vars=True,
                                       fp16_enable_auto_loss_scale=True,
                                       staged_vars=True)
    with self.assertRaises(ValueError):
      benchmark_cnn.BenchmarkCNN(params)

  def testMakeParams(self):
    default_params = benchmark_cnn.make_params()
    self.assertEqual(default_params.model,
                     flags.param_specs['model'].default_value)
    params = benchmark_cnn.make_params(model='foo')
    self.assertEqual(params.model, 'foo')
    with self.assertRaises(ValueError):
      benchmark_cnn.make_params(job_name='foo')
    with self.assertRaises(ValueError):
      benchmark_cnn.make_params(gpu_memory_frac_for_testing=-1.)


class VariableUpdateTest(tf.test.TestCase):
  """Tests that variables are updated correctly.

  These tests use a very simple deterministic model. For example, some tests use
  the model

    loss = image * A * B

  where image is a 1x1 images (with a single scalar value), and A and B are
  scalar variables. Tests will run tf_cnn_benchmarks with such a model, on a
  sequence of scalar images, and assert that the losses are the correct value.
  Since the losses depend on the variables, this indirectly tests variables are
  updated correctly.
  """

  def setUp(self):
    super(VariableUpdateTest, self).setUp()
    _check_has_gpu()
    benchmark_cnn.setup(benchmark_cnn.make_params())

  def _get_benchmark_cnn_losses(self, inputs, params):
    """Returns the losses of BenchmarkCNN on the given inputs and params."""
    logs = []
    model = test_util.TestCNNModel()
    with test_util.monkey_patch(
        benchmark_cnn,
        log_fn=test_util.print_and_add_to_list(logs),
        LOSS_AND_ACCURACY_DIGITS_TO_SHOW=15):
      bench = benchmark_cnn.BenchmarkCNN(
          params, dataset=test_util.TestDataSet(), model=model)
      # The test model does not use labels when computing loss, so the label
      # values do not matter as long as it's the right shape.
      labels = np.array([1] * inputs.shape[0])
      bench.input_preprocessor.set_fake_data(inputs, labels)
      if bench.eval_input_preprocessor:
        bench.eval_input_preprocessor.set_fake_data(inputs, labels)
      bench.run()
    outputs = test_util.get_training_outputs_from_logs(
        logs, params.print_training_accuracy)
    return [x.loss for x in outputs]

  def _test_variable_update(self, params):
    """Tests variables are updated correctly when the given params are used.

    A BenchmarkCNN is created with a TestCNNModel, and is run with some scalar
    images. The losses are then compared with the losses obtained with
    TestCNNModel().manually_compute_losses()

    Args:
      params: a Params tuple used to create BenchmarkCNN.
    """
    inputs = test_util.get_fake_var_update_inputs()
    actual_losses = self._get_benchmark_cnn_losses(inputs, params)
    expected_losses, = test_util.TestCNNModel().manually_compute_losses(
        inputs, 1, params)
    rtol = 3e-2 if params.use_fp16 else 1e-5
    self.assertAllClose(actual_losses[:len(expected_losses)], expected_losses,
                        rtol=rtol, atol=0.)

  def _test_variable_updates(self, params,
                             var_updates=('parameter_server', 'replicated')):
    for var_update in var_updates:
      self._test_variable_update(params._replace(variable_update=var_update))

  def testDefault(self):
    params = test_util.get_var_update_params()
    self._test_variable_updates(params)

  # For some reason, this test doesn't always pass
  # def testCpuAsDevice(self):
  #   params = test_util.get_var_update_params()._replace(
  #       device='cpu',
  #       data_format='NHWC')  # NHWC required when --device=cpu
  #   self._test_variable_updates(params)

  def testCpuAsLocalParamDevice(self):
    params = test_util.get_var_update_params()._replace(
        local_parameter_device='cpu')
    self._test_variable_updates(params)

  def testFp16(self):
    params = test_util.get_var_update_params()._replace(use_fp16=True)
    self._test_variable_updates(params)

  def testMomentum(self):
    params = test_util.get_var_update_params()._replace(optimizer='momentum')
    self._test_variable_updates(params)

  def testRmsprop(self):
    params = test_util.get_var_update_params()._replace(optimizer='rmsprop')
    self._test_variable_updates(params)

  def testNoLayers(self):
    params = test_util.get_var_update_params()._replace(use_tf_layers=False)
    self._test_variable_updates(params)

  def testVariousAllReduceSpecs(self):
    # We do not test xring, because it requires all Variables to have at least
    # two elements.
    params = test_util.get_var_update_params()._replace(
        all_reduce_spec='pscpu')
    self._test_variable_updates(params, var_updates=('replicated',))
    params = params._replace(all_reduce_spec='psgpu')
    self._test_variable_updates(params, var_updates=('replicated',))
    # TODO(b/80125832): Enable nccl in tests
    # params = params._replace(all_reduce_spec='nccl',
    #                          compact_gradient_transfer=False)
    # self._test_variable_updates(params, var_updates=('replicated',))

  def testPrintBaseLoss(self):
    params = test_util.get_var_update_params()._replace(
        loss_type_to_report='base_loss')
    self._test_variable_updates(params)

  def testSingleL2LossOp(self):
    params = test_util.get_var_update_params()._replace(
        single_l2_loss_op=True)
    self._test_variable_updates(params)

  def testResourceVars(self):
    params = test_util.get_var_update_params()._replace(
        use_resource_vars=True)
    self._test_variable_updates(params)

  def testEvalDuringTrainingEveryNSteps(self):
    # TODO(reedwm): Test that the eval results are correct. This only tests that
    # training results are correct.
    params = test_util.get_var_update_params()._replace(
        eval_during_training_every_n_steps=1)
    self._test_variable_updates(params, var_updates=('replicated',))


class VariableMgrLocalReplicatedTest(tf.test.TestCase):

  def _test_grad_aggregation_with_var_mgr(self, variable_mgr, num_towers,
                                          num_vars, deferred_grads):
    tower_devices = ['/gpu:%d' % i for i in range(num_towers)]
    tower_grads = []
    expected_sums = [0.] * num_vars
    for i, tower_device in enumerate(tower_devices):
      with tf.device(tower_device):
        grad_vars = []
        for j in range(num_vars):
          n = num_towers * i + j
          grad_vars.append((tf.constant(n, dtype=tf.float32),
                            tf.Variable(n, dtype=tf.float32)))
          expected_sums[j] += n
      tower_grads.append(grad_vars)

    _, agg_device_grads = variable_mgr.preprocess_device_grads(tower_grads)
    expected_device_grads = []
    for i in range(num_towers):
      expected_grad_vars = []
      for j in range(num_vars):
        expected_grad_and_var = [expected_sums[j], num_towers * i + j]
        if isinstance(agg_device_grads[i][j], tuple):
          # agg_device_grads[i][j] can be a list or tuple.
          expected_grad_and_var = tuple(expected_grad_and_var)
        expected_grad_vars.append(expected_grad_and_var)
      if isinstance(agg_device_grads[i], tuple):
        # agg_device_grads[i] can be a list or tuple.
        expected_grad_vars = tuple(expected_grad_vars)
      expected_device_grads.append(expected_grad_vars)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
      sess.run(tf.initialize_all_variables())
      sess.run(variable_mgr._warmup_ops)
      if deferred_grads:
        # With deferred grads, the result of a session run is always the summed
        # gradients from the previous session run.
        sess.run(agg_device_grads)
        feed_dict = {g: 0 for grad_vars in tower_grads for g, _ in grad_vars}
        agg_device_grads_ = sess.run(agg_device_grads, feed_dict)
      else:
        agg_device_grads_ = sess.run(agg_device_grads)
      self.assertEqual(agg_device_grads_, expected_device_grads)

  def _test_grad_aggregation(self, params, num_vars):
    bench = benchmark_cnn.BenchmarkCNN(params)
    deferred_grads = (params.variable_consistency == 'relaxed')
    self._test_grad_aggregation_with_var_mgr(bench.variable_mgr,
                                             bench.num_gpus, num_vars,
                                             deferred_grads)

  def test_grad_aggregation(self):
    base_params = benchmark_cnn.make_params(num_gpus=10,
                                            variable_update='replicated',
                                            use_fp16=True)
    params = base_params
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(variable_consistency='relaxed')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(variable_consistency='relaxed',
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  compact_gradient_transfer=False)
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(num_gpus=8, hierarchical_copy=True)
    self._test_grad_aggregation(params, 10)
    # TODO(b/80125832): Enable nccl in tests
    # params = base_params._replace(all_reduce_spec='nccl',
    #                               compact_gradient_transfer=False,
    #                               # For some reason, this test freezes when
    #                               # num_gpus=10
    #                               num_gpus=8)
    # self._test_grad_aggregation(params, 10)
    params = base_params._replace(all_reduce_spec='pscpu')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(num_gpus=8,
                                  gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  hierarchical_copy=True)
    self._test_grad_aggregation(params, 10)
    # TODO(b/80125832): Enable nccl in tests
    # params = base_params._replace(num_gpus=8,
    #                               gradient_repacking=3,
    #                               variable_consistency='relaxed',
    #                               all_reduce_spec='nccl',
    #                               compact_gradient_transfer=False)
    # self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  all_reduce_spec='pscpu')
    self._test_grad_aggregation(params, 10)
    params = base_params._replace(gradient_repacking=3,
                                  variable_consistency='relaxed',
                                  all_reduce_spec='xring')
    self._test_grad_aggregation(params, 10)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
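The VariableUpdateTest docstring above describes the deterministic model `loss = image * A * B`, where A and B are scalar variables, and compares the benchmark's reported losses against manually computed ones. As a minimal sketch of what "manually computing" such losses means, the snippet below replays the model under plain SGD with NumPy; the initial values A=1, B=2 come from the TestModel comment above, but the learning rate and the helper name `manually_compute_losses_sketch` are illustrative assumptions, not the actual `test_util` API.

```python
import numpy as np

def manually_compute_losses_sketch(images, lr=0.005, a=1.0, b=2.0):
  """Replays loss = image * A * B under plain SGD, one scalar image per step.

  Illustrative only: the learning rate is an assumption, not the value used
  by test_util.TestCNNModel.manually_compute_losses.
  """
  losses = []
  for image in images:
    loss = image * a * b
    losses.append(loss)
    # Gradients of loss with respect to the scalar variables A and B.
    grad_a = image * b
    grad_b = image * a
    a -= lr * grad_a
    b -= lr * grad_b
  return losses

print(manually_compute_losses_sketch(np.array([-1.0, 0.5, 2.0])))
```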
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Utilities for CNN benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import threading

import numpy as np
import tensorflow.compat.v1 as tf


def tensorflow_version_tuple():
  v = tf.__version__
  major, minor, patch = v.split('.')
  return (int(major), int(minor), patch)


def tensorflow_version():
  vt = tensorflow_version_tuple()
  return vt[0] * 1000 + vt[1]


def log_fn(log):
  print(log)


def roll_numpy_batches(array, batch_size, shift_ratio):
  """Moves a proportion of batches from start to the end of the array.

  This function moves a proportion of batches, specified by `shift_ratio`, from
  the starts of the array to the end. The number of batches moved is rounded
  down to the nearest integer. For example,

  ```
  roll_numpy_batches([1, 2, 3, 4, 5, 6], 2, 0.34) == [3, 4, 5, 6, 1, 2]
  ```

  Args:
    array: A Numpy array whose first dimension is the batch dimension.
    batch_size: The batch size.
    shift_ratio: Proportion of batches to move from the start of the array to
      the end of the array.

  Returns:
    A new Numpy array, with a proportion of the batches at the start of `array`
    moved to the end.
  """
  num_items = array.shape[0]
  assert num_items % batch_size == 0
  num_batches = num_items // batch_size
  starting_batch = int(num_batches * shift_ratio)
  starting_item = starting_batch * batch_size
  return np.roll(array, -starting_item, axis=0)


# For Python 2.7 compatibility, we do not use threading.Barrier.
class Barrier(object):
  """Implements a lightweight Barrier.

  Useful for synchronizing a fixed number of threads at known synchronization
  points. Threads block on 'wait()' and simultaneously return once they have
  all made that call.

  # Implementation adopted from boost/thread/barrier.hpp
  """

  def __init__(self, parties):
    """Create a barrier, initialised to 'parties' threads."""
    self.cond = threading.Condition(threading.Lock())
    self.parties = parties
    # Indicates the number of waiting parties.
    self.waiting = 0
    # generation is needed to deal with spurious wakeups. If self.cond.wait()
    # wakes up for other reasons, generation will force it go back to wait().
    self.generation = 0
    self.broken = False

  def wait(self):
    """Wait for the barrier."""
    with self.cond:
      # Check if the barrier has been disabled or not.
      if self.broken:
        return
      gen = self.generation
      self.waiting += 1
      if self.waiting == self.parties:
        self.waiting = 0
        self.generation += 1
        self.cond.notify_all()
      # loop because of spurious wakeups
      while gen == self.generation:
        self.cond.wait()

  # TODO(huangyp): Remove this method once we find a way to know which step
  # is the last barrier.
  def abort(self):
    """Clear existing barrier and disable this barrier."""
    with self.cond:
      if self.waiting > 0:
        self.generation += 1
        self.cond.notify_all()
      self.broken = True


class ImageProducer(object):
  """An image producer that puts images into a staging area periodically.

  This class is useful for periodically running a set of ops, `put_ops` on a
  different thread every `batch_group_size` steps.

  The notify_image_consumption() method is used to increment an internal counter
  so that every `batch_group_size` times it is called, `put_ops` is executed. A
  barrier is placed so that notify_image_consumption() will block until
  the previous call to `put_ops` has been executed.

  The start() method is used to start the thread that runs `put_ops`.

  The done() method waits until the last put_ops is executed and stops the
  thread.

  The purpose of this class is to fill an image input pipeline every
  `batch_group_size` steps. Suppose `put_ops` supplies `batch_group_size` images
  to the input pipeline when run, and that every step, 1 batch of images is
  consumed. Then, by calling notify_image_consumption() every step, images are
  supplied to the input pipeline at the same amount they are consumed.

  Example usage:
  ```
  put_ops = ...  # Enqueues `batch_group_size` batches to a StagingArea
  get_op = ...   # Dequeues 1 batch, and does some operations on it
  batch_group_size = 4
  with tf.Session() as sess:
    image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size)
    image_producer.start()
    for _ in range(100):
      sess.run(get_op)
      image_producer.notify_image_consumption()
  ```
  """

  def __init__(self, sess, put_ops, batch_group_size, use_python32_barrier):
    self.sess = sess
    self.num_gets = 0
    self.put_ops = put_ops
    self.batch_group_size = batch_group_size
    self.done_event = threading.Event()
    if (use_python32_barrier and
        sys.version_info[0] == 3 and sys.version_info[1] >= 2):
      self.put_barrier = threading.Barrier(2)
    else:
      self.put_barrier = Barrier(2)

  def _should_put(self):
    return (self.num_gets + 1) % self.batch_group_size == 0

  def done(self):
    """Stop the image producer."""
    self.done_event.set()
    self.put_barrier.abort()
    self.thread.join()

  def start(self):
    """Start the image producer."""
    self.sess.run([self.put_ops])
    self.thread = threading.Thread(target=self._loop_producer)
    # Set daemon to true to allow Ctrl + C to terminate all threads.
    self.thread.daemon = True
    self.thread.start()

  def notify_image_consumption(self):
    """Increment the counter of image_producer by 1.

    This should only be called by the main thread that consumes images and runs
    the model computation. One batch of images should be consumed between
    calling start() and the first call to this method. Then, one batch of images
    should be consumed between any two successive calls to this method.
    """
    if self._should_put():
      self.put_barrier.wait()
    self.num_gets += 1

  def _loop_producer(self):
    while not self.done_event.isSet():
      self.sess.run([self.put_ops])
      self.put_barrier.wait()


class BaseClusterManager(object):
  """The manager for the cluster of servers running the benchmark."""

  def __init__(self, params):
    worker_hosts = params.worker_hosts.split(',')
    ps_hosts = params.ps_hosts.split(',') if params.ps_hosts else []
    cluster = {'worker': worker_hosts}
    if ps_hosts:
      cluster['ps'] = ps_hosts
    self._cluster_spec = tf.train.ClusterSpec(cluster)

  def get_target(self):
    """Returns a target to be passed to tf.Session()."""
    raise NotImplementedError('get_target must be implemented by subclass')

  def join_server(self):
    raise NotImplementedError('join must be implemented by subclass')

  def get_cluster_spec(self):
    return self._cluster_spec

  def num_workers(self):
    return len(self._cluster_spec.job_tasks('worker'))

  def num_ps(self):
    if 'ps' in self._cluster_spec.jobs:
      return len(self._cluster_spec.job_tasks('ps'))
    else:
      return 0


class GrpcClusterManager(BaseClusterManager):
  """A cluster manager for a cluster networked with gRPC."""

  def __init__(self, params, config_proto):
    super(GrpcClusterManager, self).__init__(params)
    if params.job_name == 'controller':
      self._target = 'grpc://%s' % self._cluster_spec.job_tasks('worker')[0]
    else:
      self._server = tf.train.Server(self._cluster_spec,
                                     job_name=params.job_name,
                                     task_index=params.task_index,
                                     config=config_proto,
                                     protocol=params.server_protocol)
      self._target = self._server.target

  def get_target(self):
    return self._target

  def join_server(self):
    return self._server.join()
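The Barrier class above blocks each thread in wait() until all `parties` threads have arrived, then releases them together; ImageProducer uses a two-party barrier to pace its producer thread against the consumer. A minimal sketch of exercising the barrier directly with two threads (the thread count, names, and sleep are arbitrary, and `cnn_util` here is assumed to be the module shown above on the import path):

```python
import threading
import time

import cnn_util  # the module shown above

barrier = cnn_util.Barrier(parties=2)

def worker(name):
  time.sleep(0.1)     # simulate some work before the sync point
  print(name, 'reached the barrier')
  barrier.wait()      # blocks until both threads have called wait()
  print(name, 'released')

threads = [threading.Thread(target=worker, args=('t%d' % i,)) for i in range(2)]
for t in threads:
  t.start()
for t in threads:
  t.join()
```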
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/cnn_util_test.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for tf_cnn_benchmarks.cnn_util."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import threading
import time

import tensorflow.compat.v1 as tf

import cnn_util


class CnnUtilBarrierTest(tf.test.TestCase):

  def testBarrier(self):
    num_tasks = 20
    num_waits = 4
    barrier = cnn_util.Barrier(num_tasks)
    threads = []
    sync_matrix = []
    for i in range(num_tasks):
      sync_times = [0] * num_waits
      thread = threading.Thread(
          target=self._run_task, args=(barrier, sync_times))
      thread.start()
      threads.append(thread)
      sync_matrix.append(sync_times)
    for thread in threads:
      thread.join()
    for wait_index in range(num_waits - 1):
      # Max of times at iteration i < min of times at iteration i + 1
      self.assertLessEqual(
          max([sync_matrix[i][wait_index] for i in range(num_tasks)]),
          min([sync_matrix[i][wait_index + 1] for i in range(num_tasks)]))

  def _run_task(self, barrier, sync_times):
    for wait_index in range(len(sync_times)):
      sync_times[wait_index] = time.time()
      barrier.wait()

  def testBarrierAbort(self):
    num_tasks = 2
    num_waits = 1
    sync_times = [0] * num_waits
    barrier = cnn_util.Barrier(num_tasks)
    thread = threading.Thread(
        target=self._run_task, args=(barrier, sync_times))
    thread.start()
    barrier.abort()
    # thread won't be blocked by done barrier.
    thread.join()


class ImageProducerTest(tf.test.TestCase):

  def _slow_tensorflow_op(self):
    """Returns a TensorFlow op that takes approximately 0.1s to complete."""
    def slow_func(v):
      time.sleep(0.1)
      return v
    return tf.py_func(slow_func, [tf.constant(0.)], tf.float32).op

  def _test_image_producer(self, batch_group_size, put_slower_than_get):
    # We use the variable x to simulate a staging area of images. x represents
    # the number of batches in the staging area.
    x = tf.Variable(0, dtype=tf.int32)
    if put_slower_than_get:
      put_dep = self._slow_tensorflow_op()
      get_dep = tf.no_op()
    else:
      put_dep = tf.no_op()
      get_dep = self._slow_tensorflow_op()
    with tf.control_dependencies([put_dep]):
      put_op = x.assign_add(batch_group_size, use_locking=True)
    with tf.control_dependencies([get_dep]):
      get_op = x.assign_sub(1, use_locking=True)
    with self.test_session() as sess:
      sess.run(tf.variables_initializer([x]))
      image_producer = cnn_util.ImageProducer(sess, put_op, batch_group_size,
                                              use_python32_barrier=False)
      image_producer.start()
      for _ in range(5 * batch_group_size):
        sess.run(get_op)
        # We assert x is nonnegative, to ensure image_producer never causes
        # an unstage op to block. We assert x is at most 2 * batch_group_size,
        # to ensure it doesn't use too much memory by storing too many batches
        # in the staging area.
        self.assertGreaterEqual(sess.run(x), 0)
        self.assertLessEqual(sess.run(x), 2 * batch_group_size)
        image_producer.notify_image_consumption()
        self.assertGreaterEqual(sess.run(x), 0)
        self.assertLessEqual(sess.run(x), 2 * batch_group_size)
      image_producer.done()
      time.sleep(0.1)
      self.assertGreaterEqual(sess.run(x), 0)
      self.assertLessEqual(sess.run(x), 2 * batch_group_size)

  def test_image_producer(self):
    self._test_image_producer(1, False)
    self._test_image_producer(1, True)
    self._test_image_producer(2, False)
    self._test_image_producer(2, True)
    self._test_image_producer(3, False)
    self._test_image_producer(3, True)
    self._test_image_producer(8, False)
    self._test_image_producer(8, True)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/coco_metric.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""COCO-style evaluation metrics.

Forked from reference model implementation.

COCO API: github.com/cocodataset/cocoapi/
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import atexit
import tempfile

from absl import flags
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import six
import tensorflow.compat.v1 as tf

import mlperf
import ssd_constants

FLAGS = flags.FLAGS

# https://github.com/cocodataset/cocoapi/issues/49
if six.PY3:
  import pycocotools.coco
  pycocotools.coco.unicode = str


def async_eval_runner(queue_predictions, queue_results, val_json_file):
  """Load intermediate eval results and get COCO metrics."""
  while True:
    message = queue_predictions.get()
    if message == 'STOP':  # poison pill
      break
    step, predictions = message
    results = compute_map(predictions, val_json_file)
    queue_results.put((step, results))


def compute_map(predictions, val_json_file):
  """Use model predictions to compute mAP.

  Args:
    predictions: a list of tuples returned by decoded_predictions function,
      each containing the following elements:
      image source_id, box coordinates in XYWH order, probability score, label
    val_json_file: path to COCO annotation file

  Returns:
    A dictionary that maps all COCO metrics (keys) to their values
  """

  if val_json_file.startswith("gs://"):
    _, local_val_json = tempfile.mkstemp(suffix=".json")
    tf.gfile.Remove(local_val_json)
    tf.gfile.Copy(val_json_file, local_val_json)
    atexit.register(tf.gfile.Remove, local_val_json)
  else:
    local_val_json = val_json_file

  cocoGt = COCO(local_val_json)
  cocoDt = cocoGt.loadRes(np.array(predictions))
  E = COCOeval(cocoGt, cocoDt, iouType='bbox')
  E.evaluate()
  E.accumulate()
  E.summarize()
  print("Current AP: {:.5f}".format(E.stats[0]))
  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']

  # Prefix with "COCO" to group in TensorBoard.
  return {"COCO/" + key: value for key, value in zip(metric_names, E.stats)}


def calc_iou(target, candidates):
  target_tiled = np.tile(target[np.newaxis, :], (candidates.shape[0], 1))
  # Left Top & Right Bottom
  lt = np.maximum(target_tiled[:, :2], candidates[:, :2])
  rb = np.minimum(target_tiled[:, 2:], candidates[:, 2:])

  delta = np.maximum(rb - lt, 0)
  intersect = delta[:, 0] * delta[:, 1]

  delta1 = target_tiled[:, 2:] - candidates[:, :2]
  area1 = delta1[:, 0] * delta1[:, 1]
  delta2 = target_tiled[:, 2:] - candidates[:, :2]
  area2 = delta2[:, 0] * delta2[:, 1]

  iou = intersect / (area1 + area2 - intersect)
  return iou


# TODO(haoyuzhang): Rewrite this NumPy based implementation to TensorFlow based
# implementation under ssd_model.py accuracy_function.
def decode_predictions(labels_and_predictions):
  """Decode predictions and remove unused boxes and labels."""
  predictions = []
  for example in labels_and_predictions:
    source_id = int(example[ssd_constants.SOURCE_ID])
    pred_box = example[ssd_constants.PRED_BOXES]
    pred_scores = example[ssd_constants.PRED_SCORES]

    locs, labels, probs = decode_single(
        pred_box, pred_scores, ssd_constants.OVERLAP_CRITERIA,
        ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)

    raw_height, raw_width, _ = example[ssd_constants.RAW_SHAPE]
    for loc, label, prob in zip(locs, labels, probs):
      # Ordering convention differs, hence [1], [0] rather than [0], [1]
      x, y = loc[1] * raw_width, loc[0] * raw_height
      w, h = (loc[3] - loc[1]) * raw_width, (loc[2] - loc[0]) * raw_height
      predictions.append(
          [source_id, x, y, w, h, prob, ssd_constants.CLASS_INV_MAP[label]])

  mlperf.logger.log(key=mlperf.tags.NMS_THRESHOLD,
                    value=ssd_constants.OVERLAP_CRITERIA)
  mlperf.logger.log(key=mlperf.tags.NMS_MAX_DETECTIONS,
                    value=ssd_constants.MAX_NUM_EVAL_BOXES)
  return predictions


def decode_single(bboxes_in, scores_in, criteria, max_output, max_num=200):
  # Reference to https://github.com/amdegroot/ssd.pytorch
  bboxes_out = []
  scores_out = []
  labels_out = []

  for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)):
    score = np.squeeze(score, 1)

    # skip background
    if i == 0:
      continue

    mask = score > ssd_constants.MIN_SCORE
    if not np.any(mask):
      continue

    bboxes, score = bboxes_in[mask, :], score[mask]

    score_idx_sorted = np.argsort(score)
    score_sorted = score[score_idx_sorted]

    score_idx_sorted = score_idx_sorted[-max_num:]
    candidates = []

    # perform non-maximum suppression
    while len(score_idx_sorted):
      idx = score_idx_sorted[-1]
      bboxes_sorted = bboxes[score_idx_sorted, :]
      bboxes_idx = bboxes[idx, :]
      iou = calc_iou(bboxes_idx, bboxes_sorted)

      score_idx_sorted = score_idx_sorted[iou < criteria]
      candidates.append(idx)

    bboxes_out.append(bboxes[candidates, :])
    scores_out.append(score[candidates])
    labels_out.extend([i] * len(candidates))

  if len(scores_out) == 0:
    tf.logging.info("No objects detected. Returning dummy values.")
    return (
        np.zeros(shape=(1, 4), dtype=np.float32),
        np.zeros(shape=(1,), dtype=np.int32),
        np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE,
    )

  bboxes_out = np.concatenate(bboxes_out, axis=0)
  scores_out = np.concatenate(scores_out, axis=0)
  labels_out = np.array(labels_out)

  max_ids = np.argsort(scores_out)[-max_output:]

  return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/constants.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Constants used in tf_cnn_benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from enum import Enum

# Results fetched with this prefix will not be reduced. Instead, they will be
# passed as matrices to model's postprocess function.
UNREDUCED_ACCURACY_OP_PREFIX = "tensor:"

# Eval result values with this name prefix will be included in summary.
SIMPLE_VALUE_RESULT_PREFIX = "simple_value:"


class BenchmarkMode(object):
  """Benchmark running mode."""
  TRAIN = "training"
  EVAL = "evaluation"
  TRAIN_AND_EVAL = "training + evaluation"
  FORWARD_ONLY = "forward only"


class NetworkTopology(str, Enum):
  """Network topology describes how multiple GPUs are inter-connected.
  """

  # DGX-1 uses hybrid cube mesh topology with the following device peer to peer
  # matrix:
  # DMA: 0 1 2 3 4 5 6 7
  # 0:   Y Y Y Y Y N N N
  # 1:   Y Y Y Y N Y N N
  # 2:   Y Y Y Y N N Y N
  # 3:   Y Y Y Y N N N Y
  # 4:   Y N N N Y Y Y Y
  # 5:   N Y N N Y Y Y Y
  # 6:   N N Y N Y Y Y Y
  # 7:   N N N Y Y Y Y Y
  DGX1 = "dgx1"

  # V100 in GCP are connected with the following device peer to peer matrix.
  # In this topology, bandwidth of the connection depends on if it uses NVLink
  # or PCIe link.
  # DMA: 0 1 2 3 4 5 6 7
  # 0:   Y Y Y Y N Y N N
  # 1:   Y Y Y Y N N N N
  # 2:   Y Y Y Y N N N Y
  # 3:   Y Y Y Y N N N N
  # 4:   N N N N Y Y Y Y
  # 5:   Y N N N Y Y Y Y
  # 6:   N N N N Y Y Y Y
  # 7:   N N Y N Y Y Y Y
  GCP_V100 = "gcp_v100"
TensorFlow/ComputeVision/Classification/benchmark/scripts/tf_cnn_benchmarks/convnet_builder.py deleted 100644 → 0 View file @ 4749cd5e
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN builder."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
collections
import
defaultdict
import
contextlib
import
numpy
as
np
import
tensorflow.compat.v1
as
tf
# pylint: disable=g-direct-tensorflow-import
import
mlperf
from
tensorflow.python.layers
import
convolutional
as
conv_layers
from
tensorflow.python.layers
import
core
as
core_layers
from
tensorflow.python.layers
import
normalization
as
normalization_layers
from
tensorflow.python.layers
import
pooling
as
pooling_layers
from
tensorflow.python.training
import
moving_averages
_data_format_to_channel_axis
=
{
'NCHW'
:
1
,
'NHWC'
:
3
}
class
ConvNetBuilder
(
object
):
"""Builder of cnn net."""
def
__init__
(
self
,
input_op
,
input_nchan
,
phase_train
,
use_tf_layers
,
data_format
=
'NCHW'
,
dtype
=
tf
.
float32
,
variable_dtype
=
tf
.
float32
):
self
.
top_layer
=
input_op
self
.
top_size
=
input_nchan
self
.
phase_train
=
phase_train
self
.
use_tf_layers
=
use_tf_layers
self
.
data_format
=
data_format
self
.
dtype
=
dtype
self
.
variable_dtype
=
variable_dtype
self
.
counts
=
defaultdict
(
lambda
:
0
)
self
.
use_batch_norm
=
False
self
.
batch_norm_config
=
{}
# 'decay': 0.997, 'scale': True}
self
.
channel_pos
=
(
'channels_last'
if
data_format
==
'NHWC'
else
'channels_first'
)
self
.
aux_top_layer
=
None
self
.
aux_top_size
=
0
def
get_custom_getter
(
self
):
"""Returns a custom getter that this class's methods must be called under.
All methods of this class must be called under a variable scope that was
passed this custom getter. Example:
```python
network = ConvNetBuilder(...)
with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
network.conv(...)
# Call more methods of network here
```
Currently, this custom getter only does anything if self.use_tf_layers is
True. In that case, it causes variables to be stored as dtype
self.variable_type, then casted to the requested dtype, instead of directly
storing the variable as the requested dtype.
"""
def
inner_custom_getter
(
getter
,
*
args
,
**
kwargs
):
"""Custom getter that forces variables to have type self.variable_type."""
if
not
self
.
use_tf_layers
:
return
getter
(
*
args
,
**
kwargs
)
requested_dtype
=
kwargs
[
'dtype'
]
if
not
(
requested_dtype
==
tf
.
float32
and
self
.
variable_dtype
==
tf
.
float16
):
# Only change the variable dtype if doing so does not decrease variable
# precision.
kwargs
[
'dtype'
]
=
self
.
variable_dtype
var
=
getter
(
*
args
,
**
kwargs
)
# This if statement is needed to guard the cast, because batch norm
# assigns directly to the return value of this custom getter. The cast
# makes the return value not a variable so it cannot be assigned. Batch
# norm variables are always in fp32 so this if statement is never
# triggered for them.
if
var
.
dtype
.
base_dtype
!=
requested_dtype
:
var
=
tf
.
cast
(
var
,
requested_dtype
)
return
var
return
inner_custom_getter
@
contextlib
.
contextmanager
def
switch_to_aux_top_layer
(
self
):
"""Context that construct cnn in the auxiliary arm."""
if
self
.
aux_top_layer
is
None
:
raise
RuntimeError
(
'Empty auxiliary top layer in the network.'
)
saved_top_layer
=
self
.
top_layer
saved_top_size
=
self
.
top_size
self
.
top_layer
=
self
.
aux_top_layer
self
.
top_size
=
self
.
aux_top_size
yield
self
.
aux_top_layer
=
self
.
top_layer
self
.
aux_top_size
=
self
.
top_size
self
.
top_layer
=
saved_top_layer
self
.
top_size
=
saved_top_size
def
get_variable
(
self
,
name
,
shape
,
dtype
,
cast_dtype
,
*
args
,
**
kwargs
):
# TODO(reedwm): Currently variables and gradients are transferred to other
# devices and machines as type `dtype`, not `cast_dtype`. In particular,
# this means in fp16 mode, variables are transferred as fp32 values, not
# fp16 values, which uses extra bandwidth.
var
=
tf
.
get_variable
(
name
,
shape
,
dtype
,
*
args
,
**
kwargs
)
return
tf
.
cast
(
var
,
cast_dtype
)
def
_conv2d_impl
(
self
,
input_layer
,
num_channels_in
,
filters
,
kernel_size
,
strides
,
padding
,
kernel_initializer
):
if
self
.
use_tf_layers
:
return
conv_layers
.
conv2d
(
input_layer
,
filters
,
kernel_size
,
strides
,
padding
,
self
.
channel_pos
,
kernel_initializer
=
kernel_initializer
,
use_bias
=
False
)
else
:
weights_shape
=
[
kernel_size
[
0
],
kernel_size
[
1
],
num_channels_in
,
filters
]
# We use the name 'conv2d/kernel' so the variable has the same name as its
# tf.layers equivalent. This way, if a checkpoint is written when
# self.use_tf_layers == True, it can be loaded when
# self.use_tf_layers == False, and vice versa.
weights
=
self
.
get_variable
(
'conv2d/kernel'
,
weights_shape
,
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
kernel_initializer
)
if
self
.
data_format
==
'NHWC'
:
strides
=
[
1
]
+
strides
+
[
1
]
else
:
strides
=
[
1
,
1
]
+
strides
return
tf
.
nn
.
conv2d
(
input_layer
,
weights
,
strides
,
padding
,
data_format
=
self
.
data_format
)
def
conv
(
self
,
num_out_channels
,
k_height
,
k_width
,
d_height
=
1
,
d_width
=
1
,
mode
=
'SAME'
,
input_layer
=
None
,
num_channels_in
=
None
,
use_batch_norm
=
None
,
stddev
=
None
,
activation
=
'relu'
,
bias
=
0.0
,
kernel_initializer
=
None
):
"""Construct a conv2d layer on top of cnn."""
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
if
num_channels_in
is
None
:
num_channels_in
=
self
.
top_size
if
stddev
is
not
None
and
kernel_initializer
is
None
:
kernel_initializer
=
tf
.
truncated_normal_initializer
(
stddev
=
stddev
)
if
kernel_initializer
is
None
:
kernel_initializer
=
tf
.
variance_scaling_initializer
()
name
=
'conv'
+
str
(
self
.
counts
[
'conv'
])
self
.
counts
[
'conv'
]
+=
1
with
tf
.
variable_scope
(
name
):
strides
=
[
1
,
d_height
,
d_width
,
1
]
if
self
.
data_format
==
'NCHW'
:
strides
=
[
strides
[
0
],
strides
[
3
],
strides
[
1
],
strides
[
2
]]
if
mode
!=
'SAME_RESNET'
:
conv
=
self
.
_conv2d_impl
(
input_layer
,
num_channels_in
,
num_out_channels
,
kernel_size
=
[
k_height
,
k_width
],
strides
=
[
d_height
,
d_width
],
padding
=
mode
,
kernel_initializer
=
kernel_initializer
)
else
:
# Special padding mode for ResNet models
if
d_height
==
1
and
d_width
==
1
:
conv
=
self
.
_conv2d_impl
(
input_layer
,
num_channels_in
,
num_out_channels
,
kernel_size
=
[
k_height
,
k_width
],
strides
=
[
d_height
,
d_width
],
padding
=
'SAME'
,
kernel_initializer
=
kernel_initializer
)
else
:
rate
=
1
# Unused (for 'a trous' convolutions)
kernel_height_effective
=
k_height
+
(
k_height
-
1
)
*
(
rate
-
1
)
pad_h_beg
=
(
kernel_height_effective
-
1
)
//
2
pad_h_end
=
kernel_height_effective
-
1
-
pad_h_beg
kernel_width_effective
=
k_width
+
(
k_width
-
1
)
*
(
rate
-
1
)
pad_w_beg
=
(
kernel_width_effective
-
1
)
//
2
pad_w_end
=
kernel_width_effective
-
1
-
pad_w_beg
padding
=
[[
0
,
0
],
[
pad_h_beg
,
pad_h_end
],
[
pad_w_beg
,
pad_w_end
],
[
0
,
0
]]
if
self
.
data_format
==
'NCHW'
:
padding
=
[
padding
[
0
],
padding
[
3
],
padding
[
1
],
padding
[
2
]]
padded_input_layer
=
tf
.
pad
(
input_layer
,
padding
)
conv
=
self
.
_conv2d_impl
(
padded_input_layer
,
num_channels_in
,
num_out_channels
,
kernel_size
=
[
k_height
,
k_width
],
strides
=
[
d_height
,
d_width
],
padding
=
'VALID'
,
kernel_initializer
=
kernel_initializer
)
if
use_batch_norm
is
None
:
use_batch_norm
=
self
.
use_batch_norm
mlperf
.
logger
.
log_conv2d
(
input_tensor
=
input_layer
,
output_tensor
=
conv
,
stride_height
=
d_height
,
stride_width
=
d_width
,
filters
=
num_out_channels
,
initializer
=
kernel_initializer
,
use_bias
=
not
use_batch_norm
and
bias
is
not
None
)
if
not
use_batch_norm
:
if
bias
is
not
None
:
biases
=
self
.
get_variable
(
'biases'
,
[
num_out_channels
],
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
tf
.
constant_initializer
(
bias
))
biased
=
tf
.
reshape
(
tf
.
nn
.
bias_add
(
conv
,
biases
,
data_format
=
self
.
data_format
),
conv
.
get_shape
())
else
:
biased
=
conv
else
:
self
.
top_layer
=
conv
self
.
top_size
=
num_out_channels
biased
=
self
.
batch_norm
(
**
self
.
batch_norm_config
)
if
activation
==
'relu'
:
mlperf
.
logger
.
log
(
key
=
mlperf
.
tags
.
MODEL_HP_RELU
)
conv1
=
tf
.
nn
.
relu
(
biased
)
elif
activation
==
'linear'
or
activation
is
None
:
conv1
=
biased
elif
activation
==
'tanh'
:
conv1
=
tf
.
nn
.
tanh
(
biased
)
else
:
raise
KeyError
(
'Invalid activation type
\'
%s
\'
'
%
activation
)
self
.
top_layer
=
conv1
self
.
top_size
=
num_out_channels
return
conv1
def
_pool
(
self
,
pool_name
,
pool_function
,
k_height
,
k_width
,
d_height
,
d_width
,
mode
,
input_layer
,
num_channels_in
):
"""Construct a pooling layer."""
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
else
:
self
.
top_size
=
num_channels_in
name
=
pool_name
+
str
(
self
.
counts
[
pool_name
])
self
.
counts
[
pool_name
]
+=
1
if
self
.
use_tf_layers
:
pool
=
pool_function
(
input_layer
,
[
k_height
,
k_width
],
[
d_height
,
d_width
],
padding
=
mode
,
data_format
=
self
.
channel_pos
,
name
=
name
)
else
:
if
self
.
data_format
==
'NHWC'
:
ksize
=
[
1
,
k_height
,
k_width
,
1
]
strides
=
[
1
,
d_height
,
d_width
,
1
]
else
:
ksize
=
[
1
,
1
,
k_height
,
k_width
]
strides
=
[
1
,
1
,
d_height
,
d_width
]
pool
=
tf
.
nn
.
max_pool
(
input_layer
,
ksize
,
strides
,
padding
=
mode
,
data_format
=
self
.
data_format
,
name
=
name
)
if
pool_name
==
'mpool'
:
mlperf
.
logger
.
log_max_pool
(
input_tensor
=
input_layer
,
output_tensor
=
pool
)
self
.
top_layer
=
pool
return
pool
def
mpool
(
self
,
k_height
,
k_width
,
d_height
=
2
,
d_width
=
2
,
mode
=
'VALID'
,
input_layer
=
None
,
num_channels_in
=
None
):
"""Construct a max pooling layer."""
return
self
.
_pool
(
'mpool'
,
pooling_layers
.
max_pooling2d
,
k_height
,
k_width
,
d_height
,
d_width
,
mode
,
input_layer
,
num_channels_in
)
def
apool
(
self
,
k_height
,
k_width
,
d_height
=
2
,
d_width
=
2
,
mode
=
'VALID'
,
input_layer
=
None
,
num_channels_in
=
None
):
"""Construct an average pooling layer."""
return
self
.
_pool
(
'apool'
,
pooling_layers
.
average_pooling2d
,
k_height
,
k_width
,
d_height
,
d_width
,
mode
,
input_layer
,
num_channels_in
)
def
reshape
(
self
,
shape
,
input_layer
=
None
):
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
self
.
top_layer
=
tf
.
reshape
(
input_layer
,
shape
)
self
.
top_size
=
shape
[
-
1
]
# HACK This may not always work
return
self
.
top_layer
def
affine
(
self
,
num_out_channels
,
input_layer
=
None
,
num_channels_in
=
None
,
bias
=
0.0
,
stddev
=
None
,
activation
=
'relu'
):
if
input_layer
is
None
:
input_layer
=
self
.
top_layer
if
num_channels_in
is
None
:
num_channels_in
=
self
.
top_size
name
=
'affine'
+
str
(
self
.
counts
[
'affine'
])
self
.
counts
[
'affine'
]
+=
1
with
tf
.
variable_scope
(
name
):
init_factor
=
2.
if
activation
==
'relu'
else
1.
stddev
=
stddev
or
np
.
sqrt
(
init_factor
/
num_channels_in
)
kernel
=
self
.
get_variable
(
'weights'
,
[
num_channels_in
,
num_out_channels
],
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
tf
.
truncated_normal_initializer
(
stddev
=
stddev
))
biases
=
self
.
get_variable
(
'biases'
,
[
num_out_channels
],
self
.
variable_dtype
,
self
.
dtype
,
initializer
=
tf
.
constant_initializer
(
bias
))
mlperf
.
logger
.
log
(
key
=
mlperf
.
tags
.
MODEL_HP_DENSE
,
value
=
num_out_channels
)
logits
=
tf
.
nn
.
xw_plus_b
(
input_layer
,
kernel
,
biases
)
if
activation
==
'relu'
:
mlperf
.
logger
.
log
(
key
=
mlperf
.
tags
.
MODEL_HP_RELU
)
affine1
=
tf
.
nn
.
relu
(
logits
,
name
=
name
)
elif
activation
==
'linear'
or
activation
is
None
:
affine1
=
logits
else
:
raise
KeyError
(
'Invalid activation type
\'
%s
\'
'
%
activation
)
self
.
top_layer
=
affine1
self
.
top_size
=
num_out_channels
return
affine1
  def inception_module(self, name, cols, input_layer=None, in_size=None):
    if input_layer is None:
      input_layer = self.top_layer
    if in_size is None:
      in_size = self.top_size
    name += str(self.counts[name])
    self.counts[name] += 1
    with tf.variable_scope(name):
      col_layers = []
      col_layer_sizes = []
      for c, col in enumerate(cols):
        col_layers.append([])
        col_layer_sizes.append([])
        for l, layer in enumerate(col):
          ltype, args = layer[0], layer[1:]
          kwargs = {
              'input_layer': input_layer,
              'num_channels_in': in_size
          } if l == 0 else {}
          if ltype == 'conv':
            self.conv(*args, **kwargs)
          elif ltype == 'mpool':
            self.mpool(*args, **kwargs)
          elif ltype == 'apool':
            self.apool(*args, **kwargs)
          elif ltype == 'share':  # Share matching layer from previous column
            self.top_layer = col_layers[c - 1][l]
            self.top_size = col_layer_sizes[c - 1][l]
          else:
            raise KeyError(
                'Invalid layer type for inception module: \'%s\'' % ltype)
          col_layers[c].append(self.top_layer)
          col_layer_sizes[c].append(self.top_size)
      catdim = 3 if self.data_format == 'NHWC' else 1
      self.top_layer = tf.concat([layers[-1] for layers in col_layers], catdim)
      self.top_size = sum([sizes[-1] for sizes in col_layer_sizes])
      return self.top_layer
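  # Hypothetical usage sketch (assumed column spec, not from the original
  # file): `cols` is a list of columns, each a list of (layer_type, *args)
  # tuples; the first layer of each column reads from the shared input and the
  # last outputs are concatenated along the channel axis, e.g.:
  #
  #   cnn.inception_module('incept_v1', cols=[
  #       [('conv', 64, 1, 1)],
  #       [('conv', 96, 1, 1), ('conv', 128, 3, 3)],
  #       [('mpool', 3, 3, 1, 1, 'SAME'), ('conv', 32, 1, 1)],
  #   ])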
  def spatial_mean(self, keep_dims=False):
    name = 'spatial_mean' + str(self.counts['spatial_mean'])
    self.counts['spatial_mean'] += 1
    axes = [1, 2] if self.data_format == 'NHWC' else [2, 3]
    self.top_layer = tf.reduce_mean(
        self.top_layer, axes, keepdims=keep_dims, name=name)
    return self.top_layer
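  # Hypothetical usage sketch (assumed, not from the original file):
  # spatial_mean() is global average pooling over the spatial axes, e.g. as
  # the last feature layer of a ResNet-style model:
  #
  #   cnn.spatial_mean()                      # [N, H, W, C] -> [N, C] (NHWC)
  #   cnn.affine(1000, activation='linear')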
  def dropout(self, keep_prob=0.5, input_layer=None):
    if input_layer is None:
      input_layer = self.top_layer
    else:
      self.top_size = None
    name = 'dropout' + str(self.counts['dropout'])
    with tf.variable_scope(name):
      if not self.phase_train:
        keep_prob = 1.0
      if self.use_tf_layers:
        dropout = core_layers.dropout(input_layer, 1. - keep_prob,
                                      training=self.phase_train)
      else:
        dropout = tf.nn.dropout(input_layer, keep_prob)
      self.top_layer = dropout
      return dropout
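  # Hypothetical usage sketch (assumed, not from the original file): keep_prob
  # only applies during training; at eval time the code above forces keep_prob
  # to 1.0, so dropout becomes a no-op:
  #
  #   cnn.dropout(keep_prob=0.5)   # drops ~50% of activations when training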
  def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Batch normalization on `input_layer` without tf.layers."""
    # We make this function as similar as possible to the
    # tf.contrib.layers.batch_norm, to minimize the differences between using
    # layers and not using layers.
    shape = input_layer.shape
    num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
    beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                             initializer=tf.zeros_initializer())
    if use_scale:
      gamma = self.get_variable('gamma', [num_channels], tf.float32,
                                tf.float32, initializer=tf.ones_initializer())
    else:
      gamma = tf.constant(1.0, tf.float32, [num_channels])
    # For moving variables, we use tf.get_variable instead of self.get_variable,
    # since self.get_variable returns the result of tf.cast which we cannot
    # assign to.
    moving_mean = tf.get_variable('moving_mean', [num_channels],
                                  tf.float32,
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
    moving_variance = tf.get_variable('moving_variance', [num_channels],
                                      tf.float32,
                                      initializer=tf.ones_initializer(),
                                      trainable=False)
    if self.phase_train:
      bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, epsilon=epsilon,
          data_format=self.data_format, is_training=True)
      mean_update = moving_averages.assign_moving_average(
          moving_mean, batch_mean, decay=decay, zero_debias=False)
      variance_update = moving_averages.assign_moving_average(
          moving_variance, batch_variance, decay=decay, zero_debias=False)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
    else:
      bn, _, _ = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, mean=moving_mean,
          variance=moving_variance, epsilon=epsilon,
          data_format=self.data_format, is_training=False)
    return bn
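  # Note (added comment, describing typical TF1 usage rather than anything in
  # the original file): the moving-mean and moving-variance updates above are
  # only added to the UPDATE_OPS collection; a training loop built on this
  # path has to run them explicitly, e.g.:
  #
  #   update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  #   with tf.control_dependencies(update_ops):
  #     train_op = optimizer.minimize(loss)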
  def batch_norm(self, input_layer=None, decay=0.999, scale=False,
                 epsilon=0.001):
    """Adds a Batch Normalization layer."""
    if input_layer is None:
      input_layer = self.top_layer
    else:
      self.top_size = None
    name = 'batchnorm' + str(self.counts['batchnorm'])
    self.counts['batchnorm'] += 1
    center = True
    with tf.variable_scope(name) as scope:
      if self.use_tf_layers:
        layer_obj = normalization_layers.BatchNormalization(
            momentum=decay,
            scale=scale,
            epsilon=epsilon,
            fused=True,
            axis=_data_format_to_channel_axis[self.data_format],
            # We pass this 'scope' argument for compatibility with checkpoints
            # created with the contrib version of batch norm. tf_cnn_benchmarks
            # used to use the contrib version.
            _scope=scope,
            center=center,
            name=scope.name)
        bn = layer_obj.apply(input_layer, training=self.phase_train)
      else:
        bn = self._batch_norm_without_layers(input_layer, decay, scale,
                                             epsilon)
    self.top_layer = bn
    self.top_size = bn.shape[3] if self.data_format == 'NHWC' else bn.shape[1]
    self.top_size = int(self.top_size)
    mlperf.logger.log_batch_norm(
        input_tensor=input_layer, output_tensor=bn, momentum=decay,
        epsilon=epsilon, center=center, scale=scale,
        training=self.phase_train)
    return bn
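  # Hypothetical usage sketch (assumed hyperparameters, not from the original
  # file):
  #
  #   cnn.conv(64, 3, 3)
  #   cnn.batch_norm(decay=0.997, scale=True, epsilon=1e-5)
  #
  # With use_tf_layers=True this builds a fused tf.layers BatchNormalization;
  # otherwise it falls back to _batch_norm_without_layers() above.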
  def lrn(self, depth_radius, bias, alpha, beta):
    """Adds a local response normalization layer."""
    name = 'lrn' + str(self.counts['lrn'])
    self.counts['lrn'] += 1
    self.top_layer = tf.nn.lrn(
        self.top_layer, depth_radius, bias, alpha, beta, name=name)
    return self.top_layer
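  # Hypothetical usage sketch (assumed AlexNet-style parameters, not from the
  # original file):
  #
  #   cnn.lrn(depth_radius=2, bias=2.0, alpha=2e-05, beta=0.75)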