update TF code

441c8f40 · qianyj · ec90ad8e · 441c8f40 · 441c8f40 · 441c8f40
Commit 441c8f40 authored Aug 01, 2022 by qianyj
5 changed files
--- a/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/test_data/tfrecord_image_generator.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/test_data/tfrecord_image_generator.py
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generate black and white test TFRecords with Example protos.
+
+Each record within the TFRecord file is a
+serialized Example proto. The Example proto contains the following fields:
+
+  image/encoded: string containing JPEG encoded image in RGB colorspace
+  image/height: integer, image height in pixels
+  image/width: integer, image width in pixels
+  image/colorspace: string, specifying the colorspace, always 'RGB'
+  image/channels: integer, specifying the number of channels, always 3
+  image/format: string, specifying the format, always'JPEG'
+
+  image/filename: string containing the basename of the image file
+            e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
+  image/class/label: integer specifying the index in a classification layer.
+    The label ranges from [1, 1000] where 0 is not used.
+  image/class/synset: string specifying the unique ID of the label,
+    e.g. 'n01440764'
+  image/class/text: string specifying the human-readable version of the label
+    e.g. 'red fox, Vulpes vulpes'
+
+  image/object/bbox/xmin: list of integers specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/xmax: list of integers specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/ymin: list of integers specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/ymax: list of integers specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/label: integer specifying the index in a classification
+    layer. The label ranges from [1, 1000] where 0 is not used. Note this is
+    always identical to the image label.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import random
+
+import numpy as np
+import six
+import tensorflow.compat.v1 as tf
+
+
+def _int64_feature(value):
+  """Wrapper for inserting int64 features into Example proto."""
+  if not isinstance(value, list):
+    value = [value]
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+
+
+def _float_feature(value):
+  """Wrapper for inserting float features into Example proto."""
+  if not isinstance(value, list):
+    value = [value]
+  return tf.train.Feature(float_list=tf.train.FloatList(value=value))
+
+
+def _bytes_feature(value):
+  """Wrapper for inserting bytes features into Example proto."""
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+
+def _convert_to_example(filename, image_buffer, label, synset, human, bbox,
+                        height, width):
+  """Build an Example proto for an example.
+
+  Args:
+    filename: string, path to an image file, e.g., '/path/to/example.JPG'
+    image_buffer: bytes, JPEG encoding of RGB image
+    label: integer, identifier for the ground truth for the network
+    synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
+    human: string, human-readable label, e.g., 'red fox, Vulpes vulpes'
+    bbox: list of bounding boxes; each box is a list of integers
+      specifying [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to
+      the same label as the image label.
+    height: integer, image height in pixels
+    width: integer, image width in pixels
+  Returns:
+    Example proto
+  """
+  xmin = []
+  ymin = []
+  xmax = []
+  ymax = []
+  for b in bbox:
+    assert len(b) == 4
+    # pylint: disable=expression-not-assigned
+    [l.append(point) for l, point in zip([xmin, ymin, xmax, ymax], b)]
+    # pylint: enable=expression-not-assigned
+
+  colorspace = b'RGB'
+  channels = 3
+  image_format = b'JPEG'
+
+  example = tf.train.Example(features=tf.train.Features(feature={
+      'image/height': _int64_feature(height),
+      'image/width': _int64_feature(width),
+      'image/colorspace': _bytes_feature(colorspace),
+      'image/channels': _int64_feature(channels),
+      'image/class/label': _int64_feature(label),
+      'image/class/synset': _bytes_feature(six.ensure_binary(synset)),
+      'image/class/text': _bytes_feature(six.ensure_binary(human)),
+      'image/object/bbox/xmin': _float_feature(xmin),
+      'image/object/bbox/xmax': _float_feature(xmax),
+      'image/object/bbox/ymin': _float_feature(ymin),
+      'image/object/bbox/ymax': _float_feature(ymax),
+      'image/object/bbox/label': _int64_feature([label] * len(xmin)),
+      'image/format': _bytes_feature(image_format),
+      'image/filename': _bytes_feature(os.path.basename(six.ensure_binary(
+          filename))),
+      'image/encoded': _bytes_feature(image_buffer)}))
+  return example
+
+
+class ImageCoder(object):
+  """Helper class that provides TensorFlow image coding utilities."""
+
+  def __init__(self):
+    # Create a single Session to run all image coding calls.
+    self._sess = tf.Session()
+
+    # Initializes function that converts PNG to JPEG data.
+    self._image = tf.placeholder(dtype=tf.uint8)
+    self._encode_jpeg = tf.image.encode_jpeg(
+        self._image, format='rgb', quality=100)
+
+  def encode_jpeg(self, image):
+    jpeg_image = self._sess.run(self._encode_jpeg,
+                                feed_dict={self._image: image})
+    return jpeg_image
+
+
+def _process_image(coder, name):
+  """Process a single image file.
+
+  If name is "train", a black image is returned. Otherwise, a white image is
+  returned.
+
+  Args:
+    coder: instance of ImageCoder to provide TensorFlow image coding utils.
+    name: string, unique identifier specifying the data set.
+  Returns:
+    image_buffer: bytes, JPEG encoding of RGB image.
+    height: integer, image height in pixels.
+    width: integer, image width in pixels.
+  """
+  # Read the image file.
+  value = 0 if name == 'train' else 255
+  height = random.randint(30, 299)
+  width = random.randint(30, 299)
+  image = np.full((height, width, 3), value, np.uint8)
+
+  jpeg_data = coder.encode_jpeg(image)
+
+  return jpeg_data, height, width
+
+
+def _process_dataset(output_directory, num_classes, coder, name, num_images,
+                     num_shards):
+  """Process a complete data set and save it as a TFRecord.
+
+  Args:
+    output_directory: Where to put outputs.
+    num_classes: number of classes.
+    coder: Instance of an ImageCoder.
+    name: string, unique identifier specifying the data set.
+    num_images: number of images to generate.
+    num_shards: integer number of shards to create.
+  """
+  files_per_shard = num_images // num_shards
+  for shard in range(num_shards):
+    output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
+    output_file = os.path.join(output_directory, output_filename)
+    with tf.python_io.TFRecordWriter(output_file) as writer:
+      for i in range(files_per_shard):
+        index = shard * files_per_shard + i
+        image_buffer, height, width = _process_image(coder, name)
+
+        filename = '{}_{}_{}'.format(name, shard, i)
+        label = index % num_classes
+        synset = str(index)
+        human = name
+        bbox = [[0.1, 0.1, 0.9, 0.9]]
+        example = _convert_to_example(filename, image_buffer, label,
+                                      synset, human, bbox,
+                                      height, width)
+        writer.write(example.SerializeToString())
+
+
+def write_black_and_white_tfrecord_data(
+    output_directory, num_classes, num_train_images=512,
+    num_validation_images=128, train_shards=8, validation_shards=2):
+  """Writes black and white images in tfrecord format.
+
+  Training images are black and validation images are white.
+
+  Args:
+    output_directory: Where to put outputs.
+    num_classes: number of classes.
+    num_train_images: number of training images to generate.
+    num_validation_images: number of validation images to generate.
+    train_shards: integer number of training shards to create.
+    validation_shards: integer number of validation shards to create.
+  """
+
+  coder = ImageCoder()
+  _process_dataset(output_directory, num_classes, coder, 'validation',
+                   num_validation_images, validation_shards)
+  _process_dataset(output_directory, num_classes, coder, 'train',
+                   num_train_images, train_shards)
--- a/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/test_util.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/test_util.py
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Shared functionality across multiple test files."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from collections import namedtuple
+from contextlib import contextmanager
+import os
+
+import numpy as np
+import tensorflow.compat.v1 as tf
+import benchmark_cnn
+import cnn_util
+import datasets
+import preprocessing
+from models import model
+from platforms import util as platforms_util
+from test_data import tfrecord_image_generator
+from tensorflow.core.protobuf import rewriter_config_pb2  # pylint: disable=g-direct-tensorflow-import
+from tensorflow.python.platform import test
+
+
+@contextmanager
+def monkey_patch(obj, **kwargs):
+  """Context mgr to monkey patch attributes on an object (such as a module).
+
+  The attributes are patched back to their original value when the context
+  manager exits.
+
+  For example, to replace benchmark_cnn.get_data_type with an identity function,
+  do:
+
+  ```
+  with monkey_patch(benchmark_cnn, get_data_type=lambda x: x)
+    loss1 = benchmark_cnn.loss_function(1)  # loss1 will be 1
+  loss2 = benchmark_cnn.loss_function(params)  # Call the original function
+  ```
+
+  Args:
+    obj: The object (which can be a module) to monkey patch attributes on.
+    **kwargs: Dictionary mapping from attribute name to value that the attribute
+      will be patched with.
+  Yields:
+    Nothing.
+  """
+  old_values = {key: getattr(obj, key) for key in kwargs}
+  try:
+    for key, value in kwargs.items():
+      setattr(obj, key, value)
+    yield
+  finally:
+    for key, value in old_values.items():
+      setattr(obj, key, value)
+
+
+def monkey_patch_base_cluster_manager():
+  """Monkey patches get_cluster_manager to return a BaseClusterManager.
+
+  This function replaces platforms_util.get_cluster_manager with a function that
+  always return a BaseClusterManager.
+
+  This is useful for testing creating a graph in distributed mode, with only a
+  single process. GrpcClusterManager's constructor blocks until a cluster is set
+  up, which requires multiple processes to be created.
+  """
+  def get_test_cluster_manager(params, config_proto):
+    del config_proto
+    return cnn_util.BaseClusterManager(params)
+  platforms_util.get_cluster_manager = get_test_cluster_manager
+
+
+def print_and_add_to_list(print_list):
+  """Returns a function which prints the input, then adds it to print_list."""
+  def f(string):
+    print(string)
+    print_list.append(string)
+  return f
+
+
+TrainingOutput = namedtuple('TrainingOutput',
+                            ['loss', 'top_1_accuracy', 'top_5_accuracy'])
+
+
+EvalOutput = namedtuple('EvalOutput', ['top_1_accuracy', 'top_5_accuracy'])
+
+
+def get_training_outputs_from_logs(logs, print_training_accuracy):
+  """Returns a list of TrainingOutputs by parsing the logs of a training run.
+
+  Args:
+    logs: A list of strings, each which is a line from the standard output of
+      tf_cnn_benchmarks from training. Only lines in the form:
+        10 images/sec: 14.2 +/- 0.0 (jitter = 0.0) 7.020
+      are parsed (the line may also contain the training accuracies).
+    print_training_accuracy: The value of the param print_training_accuracy.
+  Returns:
+    A list of TrainingOutputs. The list has one element per element of logs
+    that is in the format above. top_1_accuracy and top_5_accuracy are set to -1
+    if the line does not contain accuracies.
+  """
+  outputs = []
+  for log in logs:
+    if 'images/sec' in log and '+/-' in log:
+      parts = log.split()
+      if print_training_accuracy:
+        # Example log with training accuracy:
+        #   10 images/sec: 0.2 +/- 0.0 (jitter = 0.0) 6.908 0.500 1.000
+        assert len(parts) == 11
+        top_1_acc = float(parts[9])
+        top_5_acc = float(parts[10])
+      else:
+        # Example log without training accuracy:
+        #   10 images/sec: 0.2 +/- 0.0 (jitter = 0.0) 6.908
+        assert len(parts) == 9
+        top_1_acc = -1
+        top_5_acc = -1
+      loss = float(parts[8])
+      outputs.append(TrainingOutput(loss=loss, top_1_accuracy=top_1_acc,
+                                    top_5_accuracy=top_5_acc))
+  assert len(outputs) >= 1
+  return outputs
+
+
+def get_evaluation_outputs_from_logs(logs):
+  """Returns the top 1 and 5 accuracies by parsing the logs of an eval run.
+
+  Args:
+    logs: A list of strings, each which is a line from the standard output of
+      tf_cnn_benchmarks from evaluation. Only lines in the form:
+        Accuracy @ 1 = 0.5000 Accuracy @ 5 = 1.0000 [80 examples]
+      is parsed.
+  Returns:
+    A list of EvalOutputs. Normally this list only has one EvalOutput, but can
+    contain multiple if training is done and
+    --eval_during_training_every_n_steps is specified.
+  """
+  eval_outputs = []
+  for log in logs:
+    if 'Accuracy @ ' in log:
+      # Example log:
+      #   Accuracy @ 1 = 0.5000 Accuracy @ 5 = 1.0000 [80 examples]
+      parts = log.split()
+      assert len(parts) == 12
+      top_1_accuracy = float(parts[4])
+      top_5_accuracy = float(parts[9])
+      eval_outputs.append(EvalOutput(top_1_accuracy, top_5_accuracy))
+  assert eval_outputs
+  return eval_outputs
+
+
+def check_training_outputs_are_reasonable(testcase, training_outputs,
+                                          print_training_accuracy,
+                                          max_final_loss=10.,
+                                          previous_final_loss=None):
+  """Checks the outputs from training a model are reasonable.
+
+  An assert is failed if the outputs are not reasonable. The final top-1 and
+  top-5 accuracies are asserted to be 1, and so the dataset used to train should
+  be trivial to learn. For example, the dataset could consist of a black image
+  with label 0 and a white image with label 1.
+
+  Args:
+    testcase: A tf.test.TestCase used for assertions.
+    training_outputs: A list of TrainingOutputs, as returned from
+      get_training_outputs_from_logs().
+    print_training_accuracy: Whether training accuracies were printed and stored
+      in training_outputs.
+    max_final_loss: The loss of the final training output is asserted to be at
+      most this value.
+    previous_final_loss: If training was resumed from a checkpoint, the loss of
+      the final step from the previous training run that saved the checkpoint.
+  """
+  if previous_final_loss is not None:
+    # Ensure the loss hasn't raised significantly from the final loss of the
+    # previous training run.
+    testcase.assertLessEqual(training_outputs[0].loss,
+                             previous_final_loss * 1.01)
+  for output in training_outputs:
+    testcase.assertLessEqual(output.loss, 100.)
+  last_output = training_outputs[-1]
+  if print_training_accuracy:
+    testcase.assertEqual(last_output.top_1_accuracy, 1.0)
+    testcase.assertEqual(last_output.top_5_accuracy, 1.0)
+  if max_final_loss is not None:
+    testcase.assertLessEqual(last_output.loss, max_final_loss)
+
+
+def train_and_eval(testcase,
+                   run_fn,
+                   params,
+                   check_output_values,
+                   max_final_loss=10.,
+                   skip=None):
+  """Trains a model then evaluates it.
+
+  This function should be used to verify training and evaluating
+  BenchmarkCNN works without crashing and that it outputs reasonable
+  values. BenchmarkCNN will be run three times. First, it will train a
+  model from scratch, saving a checkpoint. Second, it will load the checkpoint
+  to continue training. Finally, it evaluates based on the loaded checkpoint.
+
+  Args:
+    testcase: A tf.test.TestCase used for assertions.
+    run_fn: Must run `BenchmarkCNN` exactly once. BenchmarkCNN is
+      never used directly, but instead is only run through `run_fn`. `run_fn`
+      has the signature (run_type, inner_params) -> output_list, where:
+        * run_type is a string indicating how BenchmarkCNN will be run.
+          Either 'InitialTraining', 'TrainingFromCheckpoint' or 'Evaluation'.
+        * inner_params is the params BenchmarkCNN should be run with.
+        * output_list[i] is a list of lines from the ith worker's stdout.
+    params: The params BenchmarkCNN will be run with.
+      Will be passed to `run_fn` slightly modified in order to run with both
+      training and evaluation.
+    check_output_values: Whether the outputs of the workers, such as training
+      accuracy, should be checked to make sure their values are reasonable.
+      Fails an assert on `testcase` if a check fails.
+    max_final_loss: The loss of the final training output is asserted to be at
+      most this value for both training runs.
+    skip: If 'eval', evaluation is not done. if
+      'eval_and_train_from_checkpoint', evaluation and training from a
+      checkpoint are both not done.
+  """
+
+  assert not skip or skip in {'eval', 'eval_and_train_from_checkpoint'}
+
+  # Part 1: Train from scratch.
+  tf.logging.info('Training model from scratch')
+  print_training_accuracy = (params.print_training_accuracy or
+                             params.forward_only)
+  initial_train_logs = run_fn('InitialTraining', params)
+  testcase.assertGreaterEqual(len(initial_train_logs), 1)
+  for lines in initial_train_logs:
+    initial_train_outputs = get_training_outputs_from_logs(
+        lines, print_training_accuracy)
+    if params.cross_replica_sync and params.batch_group_size == 1:
+      testcase.assertEqual(len(initial_train_outputs), params.num_batches)
+    if check_output_values:
+      check_training_outputs_are_reasonable(testcase, initial_train_outputs,
+                                            print_training_accuracy,
+                                            max_final_loss=max_final_loss)
+  if params.train_dir is not None:
+    train_dir_entries = set(os.listdir(params.train_dir))
+    testcase.assertGreater(len(train_dir_entries), 0)
+  else:
+    train_dir_entries = None
+
+  if skip == 'eval_and_train_from_checkpoint':
+    return
+
+  # Part 2: Train from the loaded checkpoint.
+  testcase.assertIsNotNone(train_dir_entries)
+  tf.logging.info('Training model from loaded checkpoint')
+  # Run for same number of batches as before.
+  params = params._replace(num_batches=params.num_batches * 2)
+  train_logs_from_ckpt = run_fn('TrainingFromCheckpoint', params)
+  testcase.assertGreaterEqual(len(train_logs_from_ckpt), 1)
+  for lines in train_logs_from_ckpt:
+    train_outputs_from_ckpt = get_training_outputs_from_logs(
+        lines, print_training_accuracy)
+    if params.cross_replica_sync and params.batch_group_size == 1:
+      testcase.assertEqual(len(train_outputs_from_ckpt),
+                           params.num_batches // 2 - params.num_warmup_batches)
+    if check_output_values:
+      check_training_outputs_are_reasonable(
+          testcase, train_outputs_from_ckpt, print_training_accuracy,
+          max_final_loss=max_final_loss,
+          previous_final_loss=initial_train_outputs[-1].loss)
+  # Ensure a new checkpoint was written out.
+  testcase.assertNotEqual(train_dir_entries, set(os.listdir(params.train_dir)))
+
+  if skip == 'eval':
+    return
+
+  # Part 3: Evaluate from the loaded checkpoint.
+  tf.logging.info('Evaluating model from checkpoint')
+  params = params._replace(num_batches=params.num_batches // 2, eval=True)
+  eval_logs = run_fn('Evaluation', params)
+  testcase.assertGreaterEqual(len(eval_logs), 1)
+  for lines in eval_logs:
+    eval_outputs = get_evaluation_outputs_from_logs(lines)
+    assert len(eval_outputs) == 1
+    top_1_accuracy, top_5_accuracy = eval_outputs[0]
+    if check_output_values:
+      testcase.assertEqual(top_1_accuracy, 1.0)
+      testcase.assertEqual(top_5_accuracy, 1.0)
+
+
+def get_temp_dir(dir_name):
+  dir_path = os.path.join(test.get_temp_dir(), dir_name)
+  os.mkdir(dir_path)
+  return dir_path
+
+
+def create_black_and_white_images():
+  dir_path = get_temp_dir('black_and_white_images')
+  tfrecord_image_generator.write_black_and_white_tfrecord_data(dir_path,
+                                                               num_classes=1)
+  return dir_path
+
+
+def get_params(train_dir_name):
+  """Returns params that can be used to train."""
+  params = benchmark_cnn.make_params(
+      batch_size=2,
+      display_every=1,
+      init_learning_rate=0.005,
+      model='trivial',
+      num_batches=20,
+      num_gpus=2,
+      num_warmup_batches=5,
+      optimizer='sgd',
+      print_training_accuracy=True,
+      train_dir=get_temp_dir(train_dir_name),
+      variable_update='parameter_server',
+      weight_decay=0,
+      distortions=True,
+      distort_color_in_yiq=False)
+  return benchmark_cnn.set_default_param_values_and_env_vars(params)
+
+
+def get_var_update_params():
+  """Returns params that are used when testing variable updates."""
+  params = benchmark_cnn.make_params(
+      batch_size=2,
+      model='test_model',
+      num_gpus=2,
+      display_every=1,
+      num_warmup_batches=0,
+      num_batches=4,
+      weight_decay=2 ** -4,
+      init_learning_rate=2 ** -4,
+      optimizer='sgd')
+  return benchmark_cnn.set_default_param_values_and_env_vars(params)
+
+
+def get_fake_var_update_inputs():
+  """Returns fake input 1x1 images to use in variable update tests."""
+  # BenchmarkCNN divides by 127.5 then subtracts 1.0 from the images, so after
+  # that, the images will be -1., 0., 1., ..., 14.
+  return np.resize(127.5 * np.array(range(16)), (16, 1, 1, 1))
+
+
+def _worker_batches_in_numpy_array(numpy_inputs, batch_size, shift_ratio):
+  """Yields batches from a numpy array, for a single worker."""
+  numpy_inputs = cnn_util.roll_numpy_batches(numpy_inputs, batch_size,
+                                             shift_ratio)
+  i = 0
+  total_batches = numpy_inputs.shape[0]
+  assert total_batches % batch_size == 0
+  while True:
+    yield numpy_inputs[i:i + batch_size, ...]
+    i = (i + batch_size) % total_batches
+
+
+def manually_compute_losses(numpy_inputs, inputs_placeholder, loss, num_workers,
+                            params):
+  """Manually compute the losses each worker should report in tf_cnn_benchmarks.
+
+  This function essentially simulates tf_cnn_benchmarks, computing what the loss
+  of each worker should be. The caller should create a model, that takes in
+  images from `inputs_placeholder`, a tf.placeholder, and computes `loss`.
+
+  This function, and all ops passed to this function, must be run under a
+  tf.device('cpu:0') context manager.
+
+  Non-SGD optimizers are not supported with multiple workers.
+
+  Args:
+    numpy_inputs: A Numpy array to use as the input images.
+    inputs_placeholder: A tf.placeholder tensor, where input images can be fed
+      into.
+    loss: A scalar tensor representing the loss of the model, which is obtained
+      from the input images in inputs_placeholder.
+    num_workers: How many workers should be simulated.
+    params: Params tuple. This doesn't have to have information about the
+      distributed cluster, such as --num_workers, as num_workers is passed in
+      separately.
+
+  Returns:
+    A list of list of losses. return_value[i][j] is the loss of the ith worker
+    after the jth step.
+  """
+  batch_size = params.batch_size * params.num_gpus
+  assert numpy_inputs.shape[0] % (num_workers * batch_size) == 0
+  l2_loss = tf.add_n([tf.nn.l2_loss(x) for x in tf.trainable_variables()])
+  total_loss = loss + params.weight_decay * l2_loss
+  reported_loss = (loss if params.loss_type_to_report == 'base_loss'
+                   else total_loss)
+  gradient_multiplier = 1
+  if params.variable_update in ('replicated', 'distributed_all_reduce'):
+    # In certain variable updates, tf_cnn_benchmarks add the gradients of the
+    # GPUs instead of taking their mean, making the gradients effectively
+    # params.num_gpu times higher.
+    # TODO(b/62722498): Make all variable updates consistent.
+    gradient_multiplier = params.num_gpus
+
+  opt = benchmark_cnn.get_optimizer(params, params.init_learning_rate)
+  grad_vars = opt.compute_gradients(
+      total_loss, grad_loss=tf.constant(gradient_multiplier, dtype=tf.float32))
+  grads = [g for g, _ in grad_vars]
+  # We apply gradients from a placeholder. That way, we can first compute the
+  # gradients from each worker, then afterwards apply them one by one by feeding
+  # them into the placeholder.
+  placeholder_grad_vars = [(tf.placeholder(g.dtype, g.shape), v)
+                           for g, v in grad_vars]
+  placeholder_grads = [g for g, _ in placeholder_grad_vars]
+  apply_grads_op = opt.apply_gradients(placeholder_grad_vars)
+
+  batch_iterators = [_worker_batches_in_numpy_array(numpy_inputs, batch_size,
+                                                    shift_ratio=i / num_workers)
+                     for i in range(num_workers)]
+  # Set the GPU count to 0, to avoid taking all the GPU memory. Unfortunately,
+  # doing so still takes up about ~1GB for some reason.
+  config = tf.ConfigProto(device_count={'GPU': 0})
+  config.graph_options.rewrite_options.pin_to_host_optimization = (
+      rewriter_config_pb2.RewriterConfig.OFF)
+  with tf.Session(config=config) as sess:
+    sess.run(tf.global_variables_initializer())
+    losses = [[] for _ in range(num_workers)]
+    for i in range(params.num_batches):
+      computed_grads = []
+      for j in range(num_workers):
+        batch_feed = next(batch_iterators[j])
+        batch_feed = batch_feed / 127.5 - 1
+        worker_loss, worker_grads = sess.run((reported_loss, grads),
+                                             {inputs_placeholder: batch_feed})
+        losses[j].append(worker_loss)
+        computed_grads.append(worker_grads)
+      for worker_grads in computed_grads:
+        # TODO(reedwm): With multiple workers, applying the gradients
+        # sequentially per worker is not equivalent to what tf_cnn_benchmarks
+        # does when the optmizer is not SGD. Therefore, this currently does not
+        # work currently when num_workers > 1 and params.optimizer != 'sgd'.
+        feed_dict = dict(zip(placeholder_grads, worker_grads))
+        sess.run(apply_grads_op, feed_dict)
+  return losses
+
+
+class TestCNNModel(model.CNNModel):
+  """A simple model used for testing.
+
+  The input is a 1-channel 1x1 image, consisting of a single number. The model
+  has two scalar variables: A and B, initialized to 1 and 2 respectively. Given
+  an image x, the loss is defined as:
+
+      loss = x * A * B
+  """
+
+  def __init__(self):
+    super(TestCNNModel, self).__init__(
+        'test_cnn_model', image_size=1, batch_size=1, learning_rate=1)
+    self.depth = 1
+
+  VAR_A_INITIAL_VALUE = 1.
+  VAR_B_INITIAL_VALUE = 2.
+
+  def add_inference(self, cnn):
+    # This model only supports 1x1 images with 1 channel
+    assert cnn.top_layer.shape[1:] == (1, 1, 1)
+    # Multiply by variable A.
+    with tf.name_scope('mult_by_var_A'):
+      cnn.conv(1, 1, 1, 1, 1, use_batch_norm=None, activation=None, bias=None,
+               kernel_initializer=tf.constant_initializer(
+                   self.VAR_A_INITIAL_VALUE))
+    # Multiply by variable B.
+    with tf.name_scope('mult_by_var_B'):
+      cnn.conv(1, 1, 1, 1, 1, use_batch_norm=None, activation=None, bias=None,
+               kernel_initializer=tf.constant_initializer(
+                   self.VAR_B_INITIAL_VALUE))
+    with tf.name_scope('reshape_to_scalar'):
+      cnn.reshape([-1, 1])
+
+  def skip_final_affine_layer(self):
+    return True
+
+  def loss_function(self, inputs, build_network_result):
+    del inputs
+    return tf.reduce_mean(build_network_result.logits)
+
+  def manually_compute_losses(self, inputs, num_workers, params):
+    with tf.Graph().as_default(), tf.device('/cpu:0'):
+      a = tf.Variable(self.VAR_A_INITIAL_VALUE, name='A')
+      b = tf.Variable(self.VAR_B_INITIAL_VALUE, name='B')
+      inputs_placeholder = tf.placeholder(tf.float32,
+                                          (None, 1, 1, 1),
+                                          name='inputs_placeholder')
+      inputs_reshaped = tf.reshape(inputs_placeholder, (-1, 1))
+      loss = self.loss_function(
+          None,
+          model.BuildNetworkResult(logits=inputs_reshaped * a * b,
+                                   extra_info=None))
+      return manually_compute_losses(inputs, inputs_placeholder, loss,
+                                     num_workers, params)
+
+  def accuracy_function(self, inputs, logits):
+    del inputs
+    # Let the accuracy be the same as the loss function.
+    return {'top_1_accuracy': logits, 'top_5_accuracy': logits}
+
+
+class TestDataSet(datasets.ImageDataset):
+  """A Dataset consisting of 1x1 images with a depth of 1."""
+
+  def __init__(self, height=1, width=1, depth=1):
+    super(TestDataSet, self).__init__('test_dataset', height=height,
+                                      width=width, depth=depth, data_dir=None,
+                                      queue_runner_required=True, num_classes=1)
+
+  def num_examples_per_epoch(self, subset='train'):
+    del subset
+    return 1
+
+  def get_input_preprocessor(self, input_preprocessor='default'):
+    return preprocessing.TestImagePreprocessor
+
+  def use_synthetic_gpu_inputs(self):
+    return False
--- a/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/utils/logs/hooks_helper_test.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/ResNet50_Official/official/utils/logs/hooks_helper_test.py
@@ -13,55 +13,61 @@
 # limitations under the License.
 # ==============================================================================

-"""Tests for hooks_helper."""
+"""Benchmark script for TensorFlow.
+
+See the README for more information.
+"""

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import unittest
-
-import tensorflow as tf  # pylint: disable=g-bad-import-order
+from absl import app
+from absl import flags as absl_flags
+import tensorflow.compat.v1 as tf

-from official.utils.logs import hooks_helper
+import benchmark_cnn
+import cnn_util
+import flags
+import mlperf
+from cnn_util import log_fn


-class BaseTest(unittest.TestCase):
+flags.define_flags()
+for name in flags.param_specs.keys():
+  absl_flags.declare_key_flag(name)

-  def test_raise_in_non_list_names(self):
-    with self.assertRaises(ValueError):
-      hooks_helper.get_train_hooks(
-          'LoggingTensorHook, ProfilerHook', model_dir="", batch_size=256)
+absl_flags.DEFINE_boolean(
+    'ml_perf_compliance_logging', False,
+    'Print logs required to be compliant with MLPerf. If set, must clone the '
+    'MLPerf training repo https://github.com/mlperf/training and add '
+    'https://github.com/mlperf/training/tree/master/compliance to the '
+    'PYTHONPATH')

-  def test_raise_in_invalid_names(self):
-    invalid_names = ['StepCounterHook', 'StopAtStepHook']
-    with self.assertRaises(ValueError):
-      hooks_helper.get_train_hooks(invalid_names, model_dir="", batch_size=256)

-  def validate_train_hook_name(self,
-                               test_hook_name,
-                               expected_hook_name,
-                               **kwargs):
-    returned_hook = hooks_helper.get_train_hooks(
-        [test_hook_name], model_dir="", **kwargs)
-    self.assertEqual(len(returned_hook), 1)
-    self.assertIsInstance(returned_hook[0], tf.train.SessionRunHook)
-    self.assertEqual(returned_hook[0].__class__.__name__.lower(),
-                     expected_hook_name)
+def main(positional_arguments):
+  # Command-line arguments like '--distortions False' are equivalent to
+  # '--distortions=True False', where False is a positional argument. To prevent
+  # this from silently running with distortions, we do not allow positional
+  # arguments.
+  assert len(positional_arguments) >= 1
+  if len(positional_arguments) > 1:
+    raise ValueError('Received unknown positional arguments: %s'
+                     % positional_arguments[1:])

-  def test_get_train_hooks_logging_tensor_hook(self):
-    self.validate_train_hook_name('LoggingTensorHook', 'loggingtensorhook')
+  params = benchmark_cnn.make_params_from_flags()
+  with mlperf.mlperf_logger(absl_flags.FLAGS.ml_perf_compliance_logging,
+                            params.model):
+    params = benchmark_cnn.setup(params)
+    bench = benchmark_cnn.BenchmarkCNN(params)

-  def test_get_train_hooks_profiler_hook(self):
-    self.validate_train_hook_name('ProfilerHook', 'profilerhook')
+    tfversion = cnn_util.tensorflow_version_tuple()
+    log_fn('TensorFlow:  %i.%i' % (tfversion[0], tfversion[1]))

-  def test_get_train_hooks_examples_per_second_hook(self):
-    self.validate_train_hook_name('ExamplesPerSecondHook',
-                                  'examplespersecondhook')
+    bench.print_info()
+    bench.run()

-  def test_get_logging_metric_hook(self):
-    test_hook_name = 'LoggingMetricHook'
-    self.validate_train_hook_name(test_hook_name, 'loggingmetrichook')

 if __name__ == '__main__':
-  tf.test.main()
+  tf.disable_v2_behavior()
+  app.run(main)  # Raises error on invalid flags, unlike tf.app.run()
--- a/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/variable_mgr.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/variable_mgr.py
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines VariableMgr and subclasses used to manage variables.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import re
+
+import tensorflow.compat.v1 as tf
+
+import allreduce
+import batch_allreduce
+import variable_mgr_util
+
+
+class VariableMgr(object):
+  """Abstract superclass for class used by BenchmarkCNN to control variables.
+
+    Functions on this class are used to control how variables are created and
+    managed, and how gradients are computed and applied.
+  """
+
+  def __init__(self, benchmark_cnn):
+    self.benchmark_cnn = benchmark_cnn
+    self.staging_delta_ops = []
+    self.use_resource_vars = benchmark_cnn.params.use_resource_vars
+
+    # A variable for automatic loss scaling.
+    self.grad_has_inf_nan = None
+
+    self._reuse_vars = False
+
+  def each_tower_has_variables(self):
+    """Returns True if each GPU tower of the model has separate variables."""
+    assert False, 'Must be implemented in subclass'
+
+  def supports_staged_vars(self):
+    """Whether staged variable management is supported."""
+    return False
+
+  def create_outer_variable_scope(self, device_num):
+    """Create the tf.variable_scope around all model graph operations."""
+    del device_num  # unused by this implementation
+    assert False, 'Must be implemented in subclass'
+
+  def preprocess_device_grads(self, device_grads):
+    """Preprocess the device gradients prior to applying them.
+
+    Args:
+      device_grads: List of lists of (gradient, variable) tuples.
+        device_grads[t][g] = (gradient, variable), where t is the index of the
+        tower and g is the index of the gradient-variable pair.
+
+    Returns: a tuple of (apply_gradients_devices, gradient_state).
+      gradient_state is an opaque structure that should be passed to
+      get_gradients_to_apply() and append_apply_gradients_ops() (in that order).
+      apply_gradients_devices is a list of devices where the gradients will be
+      applied with get_gradients_to_apply() and append_apply_gradients_ops().
+    """
+    del device_grads  # unused by this implementation
+    assert False, 'Must be implemented in subclass'
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    """Returns the [(gradient, variable)] list to apply for device_num.
+
+    Args:
+      device_num: indexes into apply_gradients_devices, which was returned by an
+        earlier call to preprocess_device_grads.
+      gradient_state: from previous call to apply_gradients_devices.
+    """
+    del device_num, gradient_state  # unused by this implementation
+    assert False, 'Must be implemented in subclass'
+
+  def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops,
+                                 loss_scale_params):
+    """Adds training ops for grads to 'training_ops'.
+
+
+
+    Args:
+      gradient_state: from previous call to apply_gradients_devices.
+      opt: the underlying optimizer
+      grads: [(grad, var)] to apply
+      training_ops: list to which to add ops
+      loss_scale_params: parameters for loss scaling.
+    """
+    del gradient_state  # unused by this implementation
+
+    def get_apply_gradients_ops_func():
+      """Returns the apply_gradients op."""
+      return [opt.apply_gradients(grads)]
+
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops, get_apply_gradients_ops_func, loss_scale_params,
+        self.grad_has_inf_nan)
+
+  def get_post_init_ops(self):
+    """Returns ops that should run post-initialization."""
+    return []
+
+  def get_devices(self):
+    """Returns devices to use for computation; includes replica selection."""
+    assert False, 'Must be implemented in subclass'
+
+  def savable_variables(self):
+    """Returns a list/dict of savable variables to pass to tf.train.Saver."""
+    return tf.global_variables()
+
+  def trainable_variables_on_device(self,
+                                    rel_device_num,
+                                    abs_device_num,
+                                    writable=False):
+    """Return the set of trainable variables on device.
+
+    Args:
+      rel_device_num: local worker device index.
+      abs_device_num: global graph device index.
+      writable: whether to get a reference to the underlying variable.
+
+    Returns:
+      The set of trainable variables on the specified device.
+    """
+    del rel_device_num, writable
+    if self.each_tower_has_variables():
+      params = [
+          v for v in tf.trainable_variables()
+          if v.name.startswith('v%s/' % abs_device_num)
+      ]
+    else:
+      params = tf.trainable_variables()
+    return params
+
+  @contextlib.contextmanager
+  def reuse_variables(self):
+    """Context manager that causes variables requested to be reused.
+
+    Variables requested under this context manager must already exist, and will
+    be reused instead of being created again. This should be used if the
+    evaluation model is being built after the training model has already been
+    built. This is because the evaluation model should reuse variables from the
+    training model.
+
+    Yields:
+      Nothing.
+    """
+    old_reuse_vars = self._reuse_vars
+    try:
+      self._reuse_vars = True
+      yield
+    finally:
+      self._reuse_vars = old_reuse_vars
+
+
+class VariableMgrIndependent(VariableMgr):
+  """VariableMgr that implements the --independent mode for local jobs.
+
+     Each GPU has its own copy of the variables, and gradients are
+     not shared between towers. This can be used to check
+     performance when no data is moved between GPUs.
+  """
+
+  def each_tower_has_variables(self):
+    return True
+
+  def create_outer_variable_scope(self, device_num):
+    return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars,
+                             use_resource=self.use_resource_vars)
+
+  def preprocess_device_grads(self, device_grads):
+    return (self.benchmark_cnn.devices, device_grads)
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    device_grads = gradient_state
+    tower_grad = device_grads[device_num]
+
+    if self.benchmark_cnn.enable_auto_loss_scale and device_num == 0:
+      # Since we don't aggregate variables in --independent mode, we cannot tell
+      # if there are NaNs on all GPUs. So we arbitrarily choose to only check
+      # NaNs on the first GPU.
+      has_inf_nan_list = []
+      for grad, _ in tower_grad:
+        has_inf_nan_list.append(tf.reduce_all(tf.is_finite(grad)))
+      self.grad_has_inf_nan = tf.logical_not(tf.reduce_all(has_inf_nan_list))
+
+    return tower_grad
+
+  def get_devices(self):
+    return self.benchmark_cnn.raw_devices
+
+
+class VariableMgrLocalFetchFromPS(VariableMgr):
+  """VariableMgr that implements the --parameter_server mode for local jobs.
+
+     Variables are stored on a parameter server.  For each step, each tower gets
+     a copy of the variables from the parameter server, and sends its gradients
+     to the param server.
+  """
+
+  def each_tower_has_variables(self):
+    return False
+
+  def create_outer_variable_scope(self, device_num):
+    return tf.variable_scope('v', reuse=bool(device_num) or self._reuse_vars,
+                             use_resource=self.use_resource_vars)
+
+  def preprocess_device_grads(self, device_grads):
+    return ([self.benchmark_cnn.param_server_device], device_grads)
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    assert device_num == 0
+    device_grads = gradient_state
+    agg_grads, self.grad_has_inf_nan = (
+        variable_mgr_util.
+        aggregate_gradients_using_copy_with_variable_colocation(
+            device_grads,
+            use_mean=True,
+            check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
+    return agg_grads
+
+  def get_devices(self):
+    raw_devices = self.benchmark_cnn.raw_devices
+    if self.benchmark_cnn.local_parameter_device_flag == 'gpu':
+      return [
+          variable_mgr_util.ParamServerDeviceSetter(d, raw_devices)
+          for d in raw_devices
+      ]
+    else:
+      return [
+          tf.train.replica_device_setter(
+              worker_device=d,
+              ps_device=self.benchmark_cnn.param_server_device,
+              ps_tasks=1) for d in raw_devices
+      ]
+
+
+class VariableMgrLocalFetchFromStagedPS(VariableMgrLocalFetchFromPS):
+  """Implements fetching a local variable through staging buffers.
+  """
+
+  def __init__(self, benchmark_cnn):
+    super(VariableMgrLocalFetchFromStagedPS, self).__init__(benchmark_cnn)
+    # A data structure to track where the variables are used on each device.
+    # Indexed by device_num and var_name, each entry stores the "put" and "get"
+    # ops used for that variable on that device:
+    #   staging_vars_on_devices[device_num][var_name] == (put_op, get_op)
+    self.staging_vars_on_devices = [
+        dict() for _ in self.benchmark_cnn.raw_devices
+    ]
+
+  def supports_staged_vars(self):
+    return True
+
+  def create_outer_variable_scope(self, device_num):
+    self._custom_getter = variable_mgr_util.StagedVariableGetter(
+        device_num, self.benchmark_cnn.raw_devices, None, self)
+    return tf.variable_scope(
+        'v', reuse=bool(device_num) or self._reuse_vars,
+        custom_getter=self._custom_getter, use_resource=self.use_resource_vars)
+
+  def trainable_variables_on_device(self,
+                                    rel_device_num,
+                                    abs_device_num,
+                                    writable=False):
+    return self._custom_getter.trainable_variables_on_device(
+        rel_device_num, abs_device_num, writable=writable)
+
+
+class VariableMgrLocalReplicated(VariableMgr):
+  """VariableMgr that implements the --replicated mode for local jobs.
+
+     Each GPU has its own copy of the variables. To apply gradients,
+     either a local all-reduce algorithm is applied or a regular
+     cross-device aggregation is used to replicate the combined
+     gradients to all towers.
+  """
+
+  def __init__(self, benchmark_cnn, all_reduce_spec,
+               agg_small_grads_max_bytes, agg_small_grads_max_group,
+               allreduce_merge_scope):
+    super(VariableMgrLocalReplicated, self).__init__(benchmark_cnn)
+    if all_reduce_spec:
+      spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
+      if len(spec) != 1:
+        raise ValueError(
+            'replicated mode does not support hybrid all-reduce strategies')
+      self._all_reduce_spec = spec[0]
+    else:
+      self._all_reduce_spec = None
+    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self._agg_small_grads_max_group = agg_small_grads_max_group
+    self._warmup_ops = []
+    self._allreduce_merge_scope = allreduce_merge_scope
+    self._gradient_put_ops = None
+
+  def each_tower_has_variables(self):
+    return True
+
+  def create_outer_variable_scope(self, device_num):
+    return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars,
+                             use_resource=self.use_resource_vars)
+
+  def preprocess_device_grads(self, device_grads):
+    compact_grads = (self.benchmark_cnn.params.use_fp16 and
+                     self.benchmark_cnn.params.compact_gradient_transfer)
+    defer_grads = (self.benchmark_cnn.params.variable_consistency == 'relaxed')
+
+    grads_to_reduce = [[g for g, _ in grad_vars] for grad_vars in device_grads]
+    algorithm = batch_allreduce.algorithm_from_params(self.benchmark_cnn.params)
+    reduced_grads, self._warmup_ops = algorithm.batch_all_reduce(
+        grads_to_reduce, self.benchmark_cnn.params.gradient_repacking,
+        compact_grads, defer_grads, self.benchmark_cnn.params.xla_compile)
+    if self.benchmark_cnn.enable_auto_loss_scale:
+      # Check for infs or nans
+      is_finite_list = []
+      with tf.name_scope('check_for_inf_and_nan'):
+        for tower_grads in reduced_grads:
+          with tf.colocate_with(tower_grads[0]):
+            # TODO(tanmingxing): Create fused op that takes in a list of tensors
+            # as input and returns scalar boolean True if there are any
+            # infs/nans.
+            is_finite_list.append(tf.reduce_all(
+                [tf.reduce_all(tf.is_finite(g)) for g in tower_grads]))
+        self.grad_has_inf_nan = tf.logical_not(tf.reduce_all(is_finite_list))
+    reduced_device_grads = [[
+        (g, v) for g, (_, v) in zip(grads, grad_vars)
+    ] for grads, grad_vars in zip(reduced_grads, device_grads)]
+    return self.benchmark_cnn.devices, reduced_device_grads
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    device_grads = gradient_state
+    return device_grads[device_num]
+
+  def get_post_init_ops(self):
+    # Copy initialized values for variables on GPU 0 to other GPUs.
+    global_vars = tf.global_variables()
+    var_by_name = dict([(v.name, v) for v in global_vars])
+    post_init_ops = []
+    for v in global_vars:
+      split_name = v.name.split('/')
+      # TODO(b/62630508): use more specific prefix than v or v0.
+      if split_name[0] == 'v0' or not v.name.startswith('v'):
+        continue
+      split_name[0] = 'v0'
+      copy_from = var_by_name['/'.join(split_name)]
+      post_init_ops.append(v.assign(copy_from.read_value()))
+    post_init_ops += self._warmup_ops
+    return post_init_ops
+
+  def savable_variables(self):
+    """Return the set of variables used for saving/loading the model."""
+    params = []
+    for v in tf.global_variables():
+      split_name = v.name.split('/')
+      if split_name[0] == 'v0' or not v.name.startswith('v'):
+        params.append(v)
+    return params
+
+  def get_devices(self):
+    return self.benchmark_cnn.raw_devices
+
+
+class VariableMgrDistributedAllReduce(VariableMgr):
+  """VariableMgr that implements the --distributed_all_reduce mode.
+
+     Each GPU has its own copy of the variables. To apply gradients,
+     the specified all-reduce algorithm is used to reduce the gradients
+     and replicate the final value to all GPUs.
+  """
+
+  def __init__(self, benchmark_cnn, all_reduce_spec, job_name,
+               num_workers, agg_small_grads_max_bytes,
+               agg_small_grads_max_group, allreduce_merge_scope):
+    super(VariableMgrDistributedAllReduce, self).__init__(benchmark_cnn)
+    if not all_reduce_spec:
+      raise ValueError(
+          'distributed_all_reduce requires a non-empty all_reduce_spec')
+    self._all_reduce_spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
+    self._all_reduce_device_prefixes = (
+        allreduce.build_all_reduce_device_prefixes(job_name, num_workers))
+    self._num_workers = num_workers
+    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self._agg_small_grads_max_group = agg_small_grads_max_group
+    self._allreduce_merge_scope = allreduce_merge_scope
+    if not self._all_reduce_spec:
+      raise ValueError('all_reduce_spec must be specified')
+    self._single_session = True
+
+  def each_tower_has_variables(self):
+    return True
+
+  def create_outer_variable_scope(self, device_num):
+    """Create a scope for the named device.
+
+    Args:
+      device_num: index of device for variable scope. (Note that
+        device_num spans all processes in cluster since a single global
+        graph is used.)
+
+    Returns:
+      the requested variable_scope
+    """
+    return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars,
+                             use_resource=self.use_resource_vars)
+
+  def preprocess_device_grads(self, device_grads):
+    remaining_grads = device_grads
+    aggregated_grads = []
+    for spec_tuple in self._all_reduce_spec:
+      if spec_tuple.limit < 0:
+        this_grads = remaining_grads
+        remaining_grads = []
+      else:
+        (this_grads, remaining_grads) = allreduce.split_grads_by_size(
+            spec_tuple.limit, remaining_grads)
+      if this_grads:
+        range_agg_grads = allreduce.sum_gradients_all_reduce(
+            self._single_session,
+            self._all_reduce_device_prefixes,
+            this_grads,
+            self._num_workers,
+            spec_tuple.alg,
+            spec_tuple.shards,
+            self.benchmark_cnn.gpu_indices,
+            agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
+            agg_small_grads_max_group=self._agg_small_grads_max_group,
+            allreduce_merge_scope=self._allreduce_merge_scope)
+        if not aggregated_grads:
+          aggregated_grads = range_agg_grads
+        else:
+          assert len(aggregated_grads) == len(range_agg_grads)
+          for i in range(len(aggregated_grads)):
+            aggregated_grads[i] += range_agg_grads[i]
+    assert not remaining_grads
+    full_device_set = []
+    for grads in device_grads:
+      g, v = grads[0]
+      del v
+      full_device_set.append(g.device)
+    return (full_device_set, aggregated_grads)
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    device_grads = gradient_state
+    if device_num >= len(device_grads):
+      raise ValueError('device_num %d exceeds length of device_grads (%d)' %
+                       (device_num, len(device_grads)))
+    return device_grads[device_num]
+
+  def get_post_init_ops(self):
+    """Copy initialized values for variables to other devices."""
+    global_vars = tf.global_variables()
+    var_by_name = dict([(v.name, v) for v in global_vars])
+    post_init_ops = []
+    for v in global_vars:
+      split_name = v.name.split('/')
+      # TODO(b/62630508): use more specific prefix than v or v0.
+      if split_name[0] == 'v0' or not v.name.startswith('v'):
+        continue
+      split_name[0] = 'v0'
+      copy_from = var_by_name['/'.join(split_name)]
+      post_init_ops.append(v.assign(copy_from.read_value()))
+    return post_init_ops
+
+  def savable_variables(self):
+    """Return the set of variables used for saving/loading the model."""
+    params = []
+    for v in tf.global_variables():
+      split_name = v.name.split('/')
+      if split_name[0] == 'v0' or not v.name.startswith('v'):
+        params.append(v)
+    return params
+
+  def get_devices(self):
+    return self.benchmark_cnn.raw_devices
+
+
+# TODO(tucker): Merge this mode with DistributedAllReduce.
+class VariableMgrCollectiveAllReduce(VariableMgr):
+  """VariableMgr that implements the --collective_all_reduce mode.
+
+     Each GPU has its own copy of the variables. To apply gradients
+     the TF native collective all-reduce op is used to reduce the gradients
+     and replicate the final value to all GPUs.
+  """
+
+  def __init__(self, benchmark_cnn, all_reduce_spec,
+               num_workers, num_gpus, task_id, allreduce_merge_scope):
+    super(VariableMgrCollectiveAllReduce, self).__init__(benchmark_cnn)
+    if not all_reduce_spec:
+      raise ValueError(
+          'collective_all_reduce requires a non-empty all_reduce_spec: %s'
+          % all_reduce_spec)
+    parsed_spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
+    # So far we only support a length-1 all_reduce_spec
+    if len(parsed_spec) > 1 or parsed_spec[0].limit > 0:
+      raise ValueError(
+          'collective_all_reduce requires one single-range all_reduce_spec %s'
+          % parsed_spec)
+    self._all_reduce_spec = parsed_spec[0]
+    if self._all_reduce_spec.alg != 'collective':
+      raise ValueError(
+          'VariableMgrCollectiveAllReduce initialized with non-collective '
+          'all_reduce_spec %s' % self.all_reduce_spec)
+    self._num_workers = num_workers
+    self._num_gpus = num_gpus
+    self._task_id = task_id
+    self._allreduce_merge_scope = allreduce_merge_scope
+    self._instance_key_counter = 10000
+    self._instance_key_table = dict()
+    self._single_session = False
+    # List of prefixes for generating PS devices, unused here.
+    self._all_reduce_device_prefixes = None
+
+  def each_tower_has_variables(self):
+    return True
+
+  def create_outer_variable_scope(self, device_num):
+    """Create a scope for the named device.
+
+    Args:
+      device_num: index of device for variable scope.
+
+    Returns:
+      the requested variable_scope
+    """
+    return tf.variable_scope('v%s' % device_num, reuse=self._reuse_vars)
+
+  def preprocess_device_grads(self, device_grads):
+    reduced_grads = allreduce.sum_gradients_all_reduce(
+        self._single_session,
+        self._all_reduce_device_prefixes,
+        device_grads,
+        self._num_workers,
+        'collective',
+        self._all_reduce_spec.shards,
+        self.benchmark_cnn.gpu_indices,
+        allreduce_merge_scope=self._allreduce_merge_scope)
+    assert len(reduced_grads) == len(device_grads)
+    full_device_set = []
+    for grads in device_grads:
+      g, _ = grads[0]
+      full_device_set.append(g.device)
+    return (full_device_set, reduced_grads)
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    device_grads = gradient_state
+    if device_num >= len(device_grads):
+      raise ValueError('device_num %d exceeds length of device_grads (%d)' %
+                       (device_num, len(device_grads)))
+    return device_grads[device_num]
+
+  def _get_instance_key(self, name):
+    if name not in self._instance_key_table.keys():
+      self._instance_key_counter += 1
+      self._instance_key_table[name] = self._instance_key_counter
+    return self._instance_key_table[name]
+
+  def get_post_init_ops(self):
+    """Broadcast initialized values of variables to other devices.
+
+    Returns:
+      At task 0 device 0, broadcast_send.
+      At all other devices and tasks, broadcast_recv.
+    """
+    global_vars = tf.global_variables()
+    group_size = self._num_workers * self._num_gpus
+    post_init_ops = []
+    # Gather variables into same-var-different-device groups.
+    vars_by_suffix = dict()
+    for v in global_vars:
+      split_name = v.name.split('/')
+      mo = re.match(r'v(\d+)$', split_name[0])
+      if mo:
+        device_id = int(mo.group(1))
+        suffix = '/'.join(split_name[1:])
+        if suffix in vars_by_suffix.keys():
+          vars_by_suffix[suffix].append(v)
+        else:
+          vars_by_suffix[suffix] = [v]
+    # Generate broadcast ops for each such group.
+    for suffix in sorted(vars_by_suffix):
+      vlist = vars_by_suffix[suffix]
+      assert self._num_gpus == len(vlist)
+      devices = [v.device for v in vlist]
+      # NOTE: this key should generate the same value for all tasks
+      group_key = allreduce.collective_group_key(devices)
+      group_size = self._num_workers * len(devices)
+      instance_key = self._get_instance_key(suffix)
+      for v in vlist:
+        split_name = v.name.split('/')
+        mo = re.match(r'v(\d+)$', split_name[0])
+        if mo:
+          device_id = int(mo.group(1))
+          if (self._task_id == 0 and device_id == 0):
+            with tf.device(v.device):
+              bcast_send = allreduce.broadcast_send(
+                  v, v.shape, v.dtype, group_size, group_key, instance_key)
+              post_init_ops.append(v.assign(bcast_send))
+          else:
+            with tf.device(v.device):
+              bcast_recv = allreduce.broadcast_recv(
+                  v.shape, v.dtype, group_size, group_key, instance_key)
+              post_init_ops.append(v.assign(bcast_recv))
+    return post_init_ops
+
+  def savable_variables(self):
+    """Return the set of variables used for saving/loading the model."""
+    params = []
+    if self._task_id == 0:
+      for v in tf.global_variables():
+        split_name = v.name.split('/')
+        if split_name[0] == 'v0' or not v.name.startswith('v'):
+          params.append(v)
+    return params
+
+  def get_devices(self):
+    return self.benchmark_cnn.raw_devices
+
+
+class VariableMgrDistributedFetchFromPS(VariableMgr):
+  """Implements --variable_update=parameter_server mode for distributed jobs.
+
+     Variables are stored on a parameter server.  For each step, each tower gets
+     a copy of the variables from the parameter server, and sends its gradients
+     to the param server.
+  """
+
+  def each_tower_has_variables(self):
+    return False
+
+  def create_outer_variable_scope(self, device_num):
+    if self.benchmark_cnn.local_parameter_device_flag == 'gpu':
+      caching_devices = self.benchmark_cnn.raw_devices
+    else:
+      caching_devices = [self.benchmark_cnn.cpu_device]
+    custom_getter = variable_mgr_util.OverrideCachingDevice(
+        caching_devices, self.benchmark_cnn.cpu_device, 1024 * 64)
+    return tf.variable_scope(
+        'v', reuse=bool(device_num) or self._reuse_vars,
+        custom_getter=custom_getter, use_resource=self.use_resource_vars)
+
+  def preprocess_device_grads(self, device_grads):
+    # Returns (gradient_devices, gradient_state)
+    return ([self.benchmark_cnn.param_server_device], device_grads)
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    assert device_num == 0
+    agg_grads, self.grad_has_inf_nan = (
+        variable_mgr_util.aggregate_gradients_using_copy(
+            gradient_state,
+            use_mean=True,
+            check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
+    return agg_grads
+
+  def get_devices(self):
+    ps_strategy = variable_mgr_util.GreedyLoadBalancingStrategy(
+        self.benchmark_cnn.num_ps, variable_mgr_util.byte_size_load_fn)
+    return [
+        tf.train.replica_device_setter(
+            worker_device=d,
+            cluster=self.benchmark_cnn.cluster_manager.get_cluster_spec(),
+            ps_strategy=ps_strategy) for d in self.benchmark_cnn.raw_devices
+    ]
+
+
+class VariableMgrDistributedFetchFromStagedPS(
+    VariableMgrDistributedFetchFromPS):
+  """Extends VariableMgrDistributedFetchFromPS for --staged_vars."""
+
+  def __init__(self, benchmark_cnn):
+    super(VariableMgrDistributedFetchFromStagedPS, self).__init__(benchmark_cnn)
+    self.staging_vars_on_devices = [
+        dict() for _ in self.benchmark_cnn.raw_devices
+    ]
+    self.staged_vars_on_cpu = {}
+
+  def create_outer_variable_scope(self, device_num):
+    self._custom_getter = variable_mgr_util.StagedVariableGetter(
+        device_num, self.benchmark_cnn.raw_devices,
+        self.benchmark_cnn.cpu_device, self)
+    return tf.variable_scope(
+        'v', reuse=bool(device_num) or self._reuse_vars,
+        custom_getter=self._custom_getter, use_resource=self.use_resource_vars)
+
+  def supports_staged_vars(self):
+    return True
+
+  def trainable_variables_on_device(self,
+                                    rel_device_num,
+                                    abs_device_num,
+                                    writable=False):
+    return self._custom_getter.trainable_variables_on_device(
+        rel_device_num, abs_device_num, writable=writable)
+
+
+class VariableMgrDistributedReplicated(VariableMgr):
+  """VariableMgr that implements the --distributed_replicated mode.
+
+     Each GPU has a copy of the variables, and updates its copy after the
+     parameter servers are all updated with the gradients from all servers. Only
+     works with cross_replica_sync=true. Unlike 'replicated', does not use nccl
+     all-reduce for replicating within a server.
+  """
+
+  def each_tower_has_variables(self):
+    return True
+
+  def create_outer_variable_scope(self, device_num):
+    return tf.variable_scope(
+        'v%s' % device_num, reuse=self._reuse_vars,
+        custom_getter=variable_mgr_util.OverrideToLocalVariableIfNotPsVar(),
+        use_resource=self.use_resource_vars)
+
+  def preprocess_device_grads(self, device_grads):
+    return ([self.benchmark_cnn.param_server_device], device_grads)
+
+  def get_gradients_to_apply(self, device_num, gradient_state):
+    device_grads = gradient_state  # From 2nd result of preprocess_device_grads.
+
+    avg_grads, self.grad_has_inf_nan = (
+        variable_mgr_util.aggregate_gradients_using_copy_with_device_selection(
+            self.benchmark_cnn,
+            device_grads,
+            use_mean=True,
+            check_inf_nan=self.benchmark_cnn.enable_auto_loss_scale))
+
+    # Make shadow variable on a parameter server for each original trainable
+    # variable.
+    for i, (g, v) in enumerate(avg_grads):
+      my_name = variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/' + v.name
+      if my_name.endswith(':0'):
+        my_name = my_name[:-2]
+      new_v = tf.get_variable(
+          my_name,
+          dtype=v.dtype.base_dtype,
+          initializer=v.initial_value,
+          trainable=True)
+      avg_grads[i] = (g, new_v)
+    return avg_grads
+
+  def append_apply_gradients_ops(self, gradient_state, opt, grads, training_ops,
+                                 loss_scale_params):
+    device_grads = gradient_state  # From 2nd result of preprocess_device_grads.
+
+    def get_apply_gradients_ops_func():
+      """Returns a list of ops for updating gradients."""
+      apply_gradients_ops = []
+      # For each variable, apply the combined gradients for this server on
+      # the parameter server, and then wait for all other servers to do this.
+      for i, (g, v) in enumerate(grads):
+        apply_gradient_op = opt.apply_gradients([(g, v)])
+        barrier = self.benchmark_cnn.add_sync_queues_and_barrier(
+            'replicate_variable_%s' % i, [apply_gradient_op])
+        with tf.control_dependencies([barrier]):
+          with tf.device(self.benchmark_cnn.cpu_device):
+            updated_value = v.read_value()
+            for my_d in range(len(self.benchmark_cnn.devices)):
+              apply_gradients_ops.append(
+                  device_grads[my_d][i][1].assign(updated_value))
+      return apply_gradients_ops
+
+    variable_mgr_util.append_gradients_with_loss_scale(
+        training_ops, get_apply_gradients_ops_func, loss_scale_params,
+        self.grad_has_inf_nan)
+
+  def _strip_port(self, s):
+    if s.endswith(':0'):
+      return s[:-2]
+    return s
+
+  def get_post_init_ops(self):
+    # Copy initialized variables for variables on the parameter server
+    # to the local copy of the variable.
+
+    local_vars = tf.local_variables()
+    local_var_by_name = dict(
+        [(self._strip_port(v.name), v) for v in local_vars])
+    post_init_ops = []
+    for v in tf.global_variables():
+      if v.name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0/'):
+        prefix = self._strip_port(
+            v.name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0'):])
+        for i in range(self.benchmark_cnn.num_gpus):
+          name = 'v%s%s' % (i, prefix)
+          if name in local_var_by_name:
+            copy_to = local_var_by_name[name]
+            post_init_ops.append(copy_to.assign(v.read_value()))
+    return post_init_ops
+
+  def _remove_shadow_var_prefix_if_present(self, var_name):
+    if var_name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'):
+      return var_name[len(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/'):]
+    else:
+      return var_name
+
+  def var_dict_name(self, v):
+    return self._strip_port(self._remove_shadow_var_prefix_if_present(v.name))
+
+  def savable_variables(self):
+    """Returns a list/dict of savable variables to pass to tf.train.Saver."""
+    params = {}
+    for v in tf.global_variables():
+      assert (v.name.startswith(variable_mgr_util.PS_SHADOW_VAR_PREFIX + '/v0/')
+              or v.name in ('global_step:0', 'loss_scale:0',
+                            'loss_scale_normal_steps:0')), (
+                                'Invalid global variable: %s' % v)
+      # We store variables in the checkpoint with the shadow variable prefix
+      # removed so we can evaluate checkpoints in non-distributed replicated
+      # mode. The checkpoints can also be loaded for training in
+      # distributed_replicated mode.
+      name = self._strip_port(self._remove_shadow_var_prefix_if_present(v.name))
+      params[name] = v
+    for v in tf.local_variables():
+      # Non-trainable variables, such as batch norm moving averages, do not have
+      # corresponding global shadow variables, so we add them here. Trainable
+      # local variables have corresponding global shadow variables, which were
+      # added in the global variable loop above.
+      if v.name.startswith('v0/') and v not in tf.trainable_variables():
+        params[self._strip_port(v.name)] = v
+    return params
+
+  def get_devices(self):
+    return self.benchmark_cnn.raw_devices
--- a/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/variable_mgr_util.py
+++ b/TensorFlow/ComputeVision/Accuracy_Validation/benchmarks-master/scripts/tf_cnn_benchmarks/variable_mgr_util.py
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for VariableMgr."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as pycoll
+import operator
+
+import numpy as np
+import tensorflow.compat.v1 as tf
+
+# pylint: disable=g-direct-tensorflow-import
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import math_ops
+
+
+PS_SHADOW_VAR_PREFIX = 'ps_var'
+
+AutoLossScaleParams = pycoll.namedtuple(
+    'AutoLossScaleParams',
+    [
+        # If true, enable automatic loss scaling.
+        'enable_auto_loss_scale',
+        # The value to scale the loss before computing gradients.
+        'loss_scale',
+        # Number of normal steps with the current `loss_scale`.
+        'loss_scale_normal_steps',
+        # Increase loss scale every n steps.
+        'inc_loss_scale_every_n',
+        # If true, the current worker is chief. The current implementation
+        # relies on the chief to update loss_scale value, but in future, we
+        # might change this to ask the parameter server to update loss_scales
+        # for better performance.
+        # TODO(tanmingxing): remove this if loss_scale is updated in ps.
+        'is_chief',
+    ])
+
+
+def get_loss_scale_update_op(loss_scale, loss_scale_normal_steps,
+                             inc_loss_scale_every_n):
+  """Returns the update op for loss scaling variables.
+
+  We maintain the counter `loss_scale_normal_steps` to count the number of steps
+  we have been using the current `loss_scale`. In most cases, this function
+  increments `loss_scale_normal_steps`. However, if `loss_scale_normal_steps` is
+  greater than the threshold `inc_loss_scale_every_n`, we double `loss_scale`
+  and reset `loss_scale_normal_steps` to zero.
+
+  This op is only called if the gradients don't have any infs or nans. Instead,
+  if infs or nans occur in the gradients, we immeditately halve `loss_scale` and
+  reset `loss_scale_normal_steps` to zero.
+
+  Args:
+    loss_scale: a tf.Variable represneting the loss_scale value.
+    loss_scale_normal_steps: a tf.Variable representing the number of training
+      steps that have run since the loss_scale last changed.
+    inc_loss_scale_every_n: a Python integer threshold. `loss_scale` is
+      increased every `inc_loss_scale_every_n` steps, unless the gradients have
+      infs or nans.
+
+  Returns:
+    An op for updating `loss_scale` and `loss_scale_normal_steps`.
+  """
+
+  def increment_loss_scale_normal_steps_func():
+    return tf.group(loss_scale_normal_steps.assign_add(1))
+
+  def increase_loss_scale_func():
+    return tf.group(
+        tf.assign(loss_scale_normal_steps, 0),
+        tf.assign(loss_scale, loss_scale * 2))
+
+  # true_fn and false_fn must have the same type.
+  return tf.cond(loss_scale_normal_steps < inc_loss_scale_every_n,
+                 increment_loss_scale_normal_steps_func,
+                 increase_loss_scale_func)
+
+
+def append_gradients_with_loss_scale(training_ops, get_apply_gradients_ops_func,
+                                     loss_scale_params, grad_has_inf_nan):
+  """Selectively appends gradients update ops with loss scaling.
+
+  Args:
+    training_ops: a list of training ops to be executed.
+    get_apply_gradients_ops_func: a function that returns a list of ops for
+      applying gradients. Here, we must pass a function instead of the actual
+      list of ops; otherwise, those ops would be executed unconditionally due to
+      the semantics of tf.cond.
+    loss_scale_params: An AutoLossScaleParams tuple.
+    grad_has_inf_nan: Boolean tensor indicating whether the gradients have infs
+      or nans.
+  """
+  is_chief = loss_scale_params.is_chief
+  loss_scale = loss_scale_params.loss_scale
+  loss_scale_normal_steps = loss_scale_params.loss_scale_normal_steps
+  inc_loss_scale_every_n = loss_scale_params.inc_loss_scale_every_n
+  enable_auto_loss_scale = loss_scale_params.enable_auto_loss_scale
+
+  if loss_scale is None or not enable_auto_loss_scale or not is_chief:
+    training_ops.extend(get_apply_gradients_ops_func())
+  else:
+    # If nans/infs occurred, skip applying gradients and instead update
+    # loss_scale (halve loss_scale and reset loss_scale_normal_steps to zero).
+    def update_op_if_nan_or_inf():
+      """Update loss_scale and discard gradients if nans/infs occurred."""
+      return tf.group(
+          tf.assign(loss_scale, loss_scale / 2.),
+          tf.assign(loss_scale_normal_steps, 0))
+
+    # Otherwise, apply gradients, and update loss_scale and
+    # loss_scale_normal_steps.
+    def update_op_if_no_nan_or_inf():
+      """Apply gradients, and update loss scaling."""
+      return tf.group(
+          get_loss_scale_update_op(loss_scale, loss_scale_normal_steps,
+                                   inc_loss_scale_every_n),
+          *get_apply_gradients_ops_func())
+
+    # TODO(tanmingxing): Add support for independent and distributed all_reduce.
+    assert grad_has_inf_nan is not None
+    update_op = tf.cond(
+        grad_has_inf_nan,
+        update_op_if_nan_or_inf,
+        update_op_if_no_nan_or_inf,
+        name='cond_if_grad_has_inf_nan'
+    )
+    training_ops.append(update_op)
+
+
+# To be used with custom_getter on tf.get_variable.
+class OverrideCachingDevice(object):
+  """Variable getter which caches variables on the least loaded device.
+
+  Variables smaller than a certain threshold are cached on a single specific
+  device, as specified in the constructor. All other variables are load balanced
+  across a pool of devices, by caching each variable on the least loaded device.
+
+  Note that variable creation only happen when building the model graph on the
+  first device (see how it sets the 'reuse' parameter in
+  VariableMgr.*.create_outer_variable_scope()). That means, for all other
+  devices, the variable scope will reuse the variables created before, which
+  requires that we set the caching_device correctly as otherwise it may not be
+  able to find the previously created variable and will create a new one. This
+  requires when building the model graph on different devices, variables with
+  the same name should have same size.
+
+  TODO(laigd): consider adding tests or verification logic to enforce this, or
+  refactor it.
+  """
+
+  def __init__(self, devices, device_for_small_variables,
+               small_variable_size_threshold):
+    self.devices = devices
+    self.sizes = [0] * len(self.devices)
+    self.device_for_small_variables = device_for_small_variables
+    self.small_variable_size_threshold = small_variable_size_threshold
+
+  def __call__(self, getter, *args, **kwargs):
+    size = tf.TensorShape(kwargs['shape']).num_elements()
+    if size < self.small_variable_size_threshold:
+      device_name = self.device_for_small_variables
+    else:
+      device_index, _ = min(enumerate(self.sizes), key=operator.itemgetter(1))
+      device_name = self.devices[device_index]
+      self.sizes[device_index] += size
+
+    kwargs['caching_device'] = device_name
+    var = getter(*args, **kwargs)
+    return var
+
+
+# To be used with custom_getter on tf.get_variable. Ensures the created variable
+# is in LOCAL_VARIABLES and not GLOBAL_VARIBLES collection.
+class OverrideToLocalVariableIfNotPsVar(object):
+
+  # args and kwargs come from the custom_getter interface for Tensorflow
+  # variables, and matches tf.get_variable's signature, with the addition of
+  # 'getter' at the beginning.
+  def __call__(self, getter, name, *args, **kwargs):
+    if name.startswith(PS_SHADOW_VAR_PREFIX):
+      return getter(*args, **kwargs)
+
+    if 'collections' in kwargs:
+      collections = kwargs['collections']
+    if not collections:
+      collections = [tf.GraphKeys.GLOBAL_VARIABLES]
+    else:
+      collections = collections[:]
+    collections.remove(tf.GraphKeys.GLOBAL_VARIABLES)
+    collections.append(tf.GraphKeys.LOCAL_VARIABLES)
+    kwargs['collections'] = list(collections)
+    return getter(name, *args, **kwargs)
+
+
+class ParamServerDeviceSetter(object):
+  """Helper class to assign variables on the least loaded ps-device."""
+
+  def __init__(self, worker_device, ps_devices):
+    """Initializer for ParamServerDevicSetter.
+
+    Args:
+      worker_device: the device to use for computer ops.
+      ps_devices: a list of device to use for Variable ops. Each variable is
+      assigned to the least loaded device.
+    """
+    self.ps_devices = ps_devices
+    self.worker_device = worker_device
+    self.ps_sizes = [0] * len(self.ps_devices)
+
+  def __call__(self, op):
+    if op.device:
+      return op.device
+    if op.type not in ['Variable', 'VariableV2']:
+      return self.worker_device
+
+    device_index, _ = min(enumerate(self.ps_sizes), key=operator.itemgetter(1))
+    device_name = self.ps_devices[device_index]
+    var_size = op.outputs[0].get_shape().num_elements()
+    self.ps_sizes[device_index] += var_size
+
+    return device_name
+
+
+class StagedModelVariable(object):
+  """Staging variable wrapper that decouples reads and updates.
+
+  This class represents a variable through a staging buffer. Reads from this
+  variable directly gets from the staging buffer. Updates are stacked into
+  another staging buffer, and will be processed later.
+  """
+
+  def __init__(self, real_var, var_stage_get, variable_mgr):
+    """Initializer for the model variables through a staging buffer.
+
+    Args:
+      real_var: the underlying real variable.
+      var_stage_get: the read op from the staging buffer.
+      variable_mgr: the parent variable-manager.
+    """
+    self.real_var = real_var
+    self.var_stage_get = var_stage_get
+    self.variable_mgr = variable_mgr
+
+  def _value(self):
+    """The read access of this variable. The content from the staging buffer."""
+    return self.var_stage_get
+
+  def _ref(self):
+    """Return the underlying variable ref, required by tf.colocate_with."""
+    return self.real_var._ref()  # pylint: disable=protected-access
+
+  def read_value(self):
+    """Mimics tf.Variable.read_value()."""
+    return tf.identity(self.var_stage_get, name='read')
+
+  @property
+  def dtype(self):
+    """Return the non-reference dtype."""
+    return self.var_stage_get.dtype
+
+  def assign_sub(self, delta, name=None, read_value=True):
+    """Mimic the updates to the variable.
+
+    Args:
+      delta: is pushed into a staging buffer and will be pumped later.
+      name: currently ignored; names of ops and the StagingArea are
+            computed without using this pass name.
+      read_value: if True, will return something which evaluates to the new
+              value of the variable; if False will return the assign op.
+    Returns:
+      The actual updates. The colocation constraint will be reapplied.
+    """
+    # This parameter is ignored: the StagingArea only supports setting
+    # the shared name, not the names of individual ops it uses.
+    del name
+
+    # colocate_with(None, True) clears the colocation constraints.
+    # Push the delta into a staging buffer.
+    with ops.colocate_with(None, True), tf.device(self.var_stage_get.device):
+      delta_staging_area = data_flow_ops.StagingArea(
+          [self.var_stage_get.dtype], shapes=[self.var_stage_get.shape])
+      delta_put_op = delta_staging_area.put([delta])
+      self.variable_mgr.staging_delta_ops.append(delta_put_op)
+      delta_get_op = delta_staging_area.get()[0]
+    # Return the actual updates. The colocation constraint will be reapplied.
+    return self.real_var.assign_sub(delta_get_op, read_value=read_value)
+
+  @staticmethod
+  # pylint: disable=bad-staticmethod-argument,invalid-name
+  def _TensorConversionFunction(self, dtype=None, name=None, as_ref=False):
+    """Utility function for converting a StagedModelVariable to a Tensor."""
+    del dtype, name  # unused: this function returns the cached ref or value.
+    if as_ref:
+      return self._ref()
+    else:
+      return self._value()
+
+
+ops.register_tensor_conversion_function(
+    StagedModelVariable, StagedModelVariable._TensorConversionFunction)  # pylint: disable=protected-access
+
+
+class StagedVariableGetter(object):
+  """A variable getter through staging buffers on devices.
+
+  Instead of a caching device, this getter tracks where the variable is used.
+  And on each device, it goes through a staging buffer.
+  """
+
+  def __init__(self, device_num, devices, cpu_device, variable_mgr):
+    """Initializer for StagedVariableGetter.
+
+    Args:
+      device_num: the current device index.
+      devices: a list of all the devices to build towers.
+      cpu_device: a cpu_device for this replica. If None, no cpu-caching is
+          done.
+      variable_mgr: the parent variable manager.
+    """
+    self.device_num = device_num
+    self.devices = devices
+    self.cpu_device = cpu_device
+    self.variable_mgr = variable_mgr
+
+  def __call__(self, getter, name, *args, **kwargs):
+    staging_ops = self.variable_mgr.staging_vars_on_devices[self.device_num]
+    if name in staging_ops:
+      put_op, get_op = staging_ops[name]
+      return get_op
+    real_var = getter(name, *args, **kwargs)
+    shape = kwargs['shape']
+    dtype = kwargs['dtype']
+    trainable = kwargs['trainable']
+    if self.cpu_device:
+      with tf.device(self.cpu_device):
+        # This helps copying the weights from the parameter to this server only
+        # once.
+        if name in self.variable_mgr.staged_vars_on_cpu:
+          cpu_var = self.variable_mgr.staged_vars_on_cpu[name]
+        else:
+          cpu_var = tf.identity(real_var)
+          self.variable_mgr.staged_vars_on_cpu[name] = cpu_var
+      var_to_stage = cpu_var
+    else:
+      var_to_stage = tf.identity(real_var)  # de-reference the variable.
+
+    with tf.device(self.devices[self.device_num]):
+      staging_area = data_flow_ops.StagingArea([dtype], shapes=[shape])
+      put_op = staging_area.put([var_to_stage])
+      get_op = staging_area.get()[0]
+      staging_ops[name] = (put_op, get_op)
+    if trainable:
+      # For trainable variables, they are managed separatedly through
+      # apply_gradients.
+      return get_op
+    else:
+      # For other shadow variables, the access is decoupled through a wrapper
+      # class.
+      return StagedModelVariable(real_var, get_op, self.variable_mgr)
+
+  def trainable_variables_on_device(self, rel_device_num, abs_device_num,
+                                    writable):
+    """Return the set of trainable variables on the specified device.
+
+    Args:
+      rel_device_num: local worker device index.
+      abs_device_num: global graph device index.
+      writable: whether the returned variables is writable or read-only.
+
+    Returns:
+      Return the set of trainable variables on the specified device.
+    """
+    del abs_device_num
+    params_refs = tf.trainable_variables()
+    if writable:
+      return params_refs
+    params = []
+    for param in params_refs:
+      var_name = param.name.split(':')[0]
+      _, var_get_op = self.variable_mgr.staging_vars_on_devices[rel_device_num][
+          var_name]
+      params.append(var_get_op)
+    return params
+
+
+def aggregate_gradients_using_copy_with_device_selection(
+    benchmark_cnn, tower_grads, use_mean, check_inf_nan):
+  """Aggregate gradients, controlling device for the aggregation.
+
+  Args:
+    benchmark_cnn: benchmark_cnn class.
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over towers. The inner list is over individual gradients.
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: If true, check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all towers. The variable is chosen from
+      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
+  """
+  if benchmark_cnn.local_parameter_device_flag == 'gpu':
+    avail_devices = benchmark_cnn.raw_devices
+  else:
+    avail_devices = [benchmark_cnn.param_server_device]
+  agg_grads = []
+  has_nan_or_inf_list = []
+  for i, single_grads in enumerate(zip(*tower_grads)):
+    with tf.device(avail_devices[i % len(avail_devices)]):
+      grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy(
+          single_grads, use_mean, check_inf_nan)
+      agg_grads.append(grad_and_var)
+      has_nan_or_inf_list.append(has_nan_or_inf)
+  if check_inf_nan:
+    return agg_grads, tf.reduce_any(has_nan_or_inf_list)
+  else:
+    return agg_grads, None
+
+
+def aggregate_gradients_using_copy_with_variable_colocation(
+    tower_grads, use_mean, check_inf_nan):
+  """Aggregate gradients, colocating computation with the gradient's variable.
+
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over towers. The inner list is over individual gradients. All variables
+      of the same gradient across towers must be the same (that is,
+      tower_grads[x][a][1] == tower_grads[y][a][1] for all indices x, y, and a)
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: If true, check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all towers. The variable is chosen from
+      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
+  """
+  agg_grads = []
+  has_nan_or_inf_list = []
+  for single_grads in zip(*tower_grads):
+    # Note that each single_grads looks like the following:
+    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+    var = single_grads[0][1]
+
+    for _, v in single_grads:
+      assert v == var
+
+    with tf.device(var.device):
+      grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy(
+          single_grads, use_mean, check_inf_nan)
+      agg_grads.append(grad_and_var)
+      has_nan_or_inf_list.append(has_nan_or_inf)
+
+  if check_inf_nan:
+    return agg_grads, tf.reduce_any(has_nan_or_inf_list)
+  else:
+    return agg_grads, None
+
+
+def aggregate_gradients_using_copy(tower_grads, use_mean, check_inf_nan):
+  """Calculate the average gradient for each shared variable across all towers.
+
+  Note that this function provides a synchronization point across all towers.
+
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over towers. The inner list is over individual gradients.
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all towers. The variable is chosen from
+      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
+  """
+  agg_grads = []
+  has_nan_or_inf_list = []
+
+  for single_grads in zip(*tower_grads):
+    grad_and_var, has_nan_or_inf = aggregate_single_gradient_using_copy(
+        single_grads, use_mean, check_inf_nan)
+    agg_grads.append(grad_and_var)
+    has_nan_or_inf_list.append(has_nan_or_inf)
+
+  if check_inf_nan:
+    return agg_grads, tf.reduce_any(has_nan_or_inf_list)
+  else:
+    return agg_grads, None
+
+
+# The following two functions are copied from
+# tensorflow/python/eager/backprop.py. We do not directly use them as they are
+# not exported and subject to change at any time.
+def flatten_nested_indexed_slices(grad):
+  assert isinstance(grad, ops.IndexedSlices)
+  if isinstance(grad.values, ops.Tensor):
+    return grad
+  else:
+    assert isinstance(grad.values, ops.IndexedSlices)
+    g = flatten_nested_indexed_slices(grad.values)
+    return ops.IndexedSlices(g.values, array_ops.gather(grad.indices,
+                                                        g.indices),
+                             g.dense_shape)
+
+
+def aggregate_indexed_slices_gradients(grads):
+  """Aggregates gradients containing `IndexedSlices`s."""
+  if len(grads) < 1:
+    return None
+  elif len(grads) == 1:
+    return grads[0]
+  else:
+    grads = [g for g in grads if g is not None]
+    # If any gradient is a `Tensor`, sum them up and return a dense tensor
+    # object.
+    if any(isinstance(g, ops.Tensor) for g in grads):
+      return math_ops.add_n(grads)
+
+    # The following `_as_indexed_slices_list` casts ids of IndexedSlices into
+    # int64. It is to make sure the inputs of `concat` all have same the data
+    # type.
+    grads = math_ops._as_indexed_slices_list(grads)  # pylint: disable=protected-access
+
+    grads = [flatten_nested_indexed_slices(x) for x in grads]
+    # Form IndexedSlices out of the concatenated values and indices.
+    concat_grad = ops.IndexedSlices(
+        array_ops.concat([x.values for x in grads], axis=0),
+        array_ops.concat([x.indices for x in grads], axis=0),
+        grads[0].dense_shape)
+
+    return concat_grad
+
+
+def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
+                                         check_inf_nan):
+  """Calculate the average gradient for a shared variable across all towers.
+
+  Note that this function provides a synchronization point across all towers.
+
+  Args:
+    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
+      (gradient, variable) pair within the outer list represents the gradient
+      of the variable calculated for a single tower, and the number of pairs
+      equals the number of towers.
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all towers. The variable is chosen from
+      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
+  """
+  grads = [g for g, _ in grad_and_vars]
+  if any(isinstance(g, tf.IndexedSlices) for g in grads):
+    # TODO(reedwm): All-reduce IndexedSlices more effectively.
+    grad = aggregate_indexed_slices_gradients(grads)
+  else:
+    grad = tf.add_n(grads)
+
+  if use_mean and len(grads) > 1:
+    grad = tf.scalar_mul(1.0 / len(grads), grad)
+
+  v = grad_and_vars[0][1]
+  if check_inf_nan:
+    with tf.name_scope('check_for_inf_and_nan'):
+      has_nan_or_inf = tf.logical_not(tf.reduce_all(tf.is_finite(grads)))
+    return (grad, v), has_nan_or_inf
+  else:
+    return (grad, v), None
+
+
+# This class is copied from
+# https://github.com/tensorflow/tensorflow/blob/590d6eef7e91a6a7392c8ffffb7b58f2e0c8bc6b/tensorflow/contrib/training/python/training/device_setter.py#L56.
+# We copy it since contrib has been removed from TensorFlow.
+class GreedyLoadBalancingStrategy(object):
+  """Returns the least-loaded ps task for op placement.
+
+  The load is calculated by a user-specified load function passed in at
+  construction.  There are no units for load, and the load function is
+  responsible for providing an internally consistent measure.
+
+  Note that this strategy is very sensitive to the exact order in which
+  ps ops (typically variables) are created, as it greedily places ops
+  on the least-loaded ps at the point each op is processed.
+
+  One reasonable heuristic is the `byte_size_load_fn`, which
+  estimates load as the number of bytes that would be used to store and
+  transmit the entire variable.  More advanced load functions
+  could consider the difference in access patterns across ops, or trade
+  off CPU-intensive ops with RAM-intensive ops with network bandwidth.
+
+  This class is intended to be used as a `ps_strategy` in
+  `tf.compat.v1.train.replica_device_setter`.
+  """
+
+  def __init__(self, num_tasks, load_fn):
+    """Create a new `LoadBalancingStrategy`.
+
+    Args:
+      num_tasks: Number of ps tasks to cycle among.
+      load_fn: A callable that takes an `Operation` and returns a
+        numeric load value for that op.
+    """
+    self._num_tasks = num_tasks
+    self._load_fn = load_fn
+    self._ps_loads = np.zeros(num_tasks)
+
+  def __call__(self, op):
+    """Choose a ps task index for the given `Operation`.
+
+    Args:
+      op: A `Operation` to be placed on ps.
+
+    Returns:
+      The next ps task index to use for the `Operation`. Greedily
+      places the op on the least-loaded ps task so far, as determined
+      by the load function.
+    """
+    task = np.argmin(self._ps_loads)
+    self._ps_loads[task] += self._load_fn(op)
+    return task
+
+
+# This function is copied from
+# https://github.com/tensorflow/tensorflow/blob/590d6eef7e91a6a7392c8ffffb7b58f2e0c8bc6b/tensorflow/contrib/training/python/training/device_setter.py#L105.
+# We copy it since contrib has been removed from TensorFlow.
+def byte_size_load_fn(op):
+  """Load function that computes the byte size of a single-output `Operation`.
+
+  This is intended to be used with `"Variable"` ops, which have a single
+  `Tensor` output with the contents of the variable.  However, it can also be
+  used for calculating the size of any op that has a single output.
+
+  Intended to be used with `GreedyLoadBalancingStrategy`.
+
+  Args:
+    op: An `Operation` with a single output, typically a "Variable" op.
+
+  Returns:
+    The number of bytes in the output `Tensor`.
+
+  Raises:
+    ValueError: if `op` does not have a single output, or if the shape of the
+      single output is not fully-defined.
+  """
+  if len(op.outputs) != 1:
+    raise ValueError('Op %s must have a single output' % op)
+  output = op.outputs[0]
+  elem_size = output.dtype.size
+  shape = output.get_shape()
+  if not shape.is_fully_defined():
+    # Due to legacy behavior, scalar "Variable" ops have output Tensors that
+    # have unknown shape when the op is created (and hence passed to this
+    # load function for placement), even though the scalar shape is set
+    # explicitly immediately afterward.
+    shape = tensor_shape.TensorShape(op.get_attr('shape'))
+  shape.assert_is_fully_defined()
+  return shape.num_elements() * elem_size
+