dcuai / dlexamples / Commits

Commit a32ffa95
Authored Feb 03, 2023 by qianyj
Parent: e286da17

    update TensorFlow2x test method

Changes: 268 files in this commit. Showing 20 changed files below, with 0 additions and 7372 deletions (+0 -7372).
Files changed (20 shown):

  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tensorflow_profiler.py  +0 -128
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_no_processes.txt  +0 -40
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_processes.txt  +0 -43
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/nvme_device_log.txt  +0 -15
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tpu_runtime_utils.py  +0 -89
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils.py  +0 -546
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils_test.py  +0 -219
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/setup.py  +0 -197
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/screenshots/profiling_overview.png  +0 -0
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/screenshots/profiling_trace_view.png  +0 -0
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/scripts/create_big_table.txt  +0 -43
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/scripts/generate-readme-header.sh  +0 -275
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/scripts/plot_process_info.py  +0 -74
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/README.md  +0 -88
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py  +0 -290
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py  +0 -52
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/allreduce.py  +0 -649
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/allreduce_test.py  +0 -448
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/batch_allreduce.py  +0 -628
  TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/benchmark_cnn.py  +0 -3548
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tensorflow_profiler.py
deleted file (mode 100644 → 0)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Collect profiler data for Tensorboard with a separate thread."""
from __future__ import print_function

import logging
import os
import sched
import threading
import time
import traceback
import perfzero.utils as utils


def _start_profiler(output_dir):
  """Start profiler.

  Args:
    output_dir: log directory to place the profiler data
  """
  import tensorflow as tf  # pylint: disable=g-import-not-at-top
  profiler_data_dir = os.path.join(output_dir, 'profiler_data')
  utils.make_dir_if_not_exist(profiler_data_dir)
  logging.info('Starting TensorFlow profiler and saving data to dir %s',
               profiler_data_dir)
  try:
    tf.profiler.experimental.start(profiler_data_dir)
    logging.info('Started TensorFlow profiler')
  except Exception:  # pylint: disable=broad-except
    logging.error('TensorFlow profiler failed to start due to error:\n%s',
                  traceback.format_exc())


def _stop_profiler():
  """Stop profiler."""
  import tensorflow as tf  # pylint: disable=g-import-not-at-top
  try:
    tf.profiler.experimental.stop()
    logging.info('Stopped TensorFlow profiler.')
  except Exception:  # pylint: disable=broad-except
    logging.error('TensorFlow profiler failed to stop due to error:\n%s',
                  traceback.format_exc())


class TensorFlowProfiler(object):
  """Collect profiler data for Tensorboard with a separate thread."""

  def __init__(self, profiler_enabled_time_str, output_dir):
    """Constructor.

    Args:
      profiler_enabled_time_str: the value of the config --profiler_enabled_time
      output_dir: log directory to place the profiler data
    """
    self.profiler_enabled_time_str = profiler_enabled_time_str
    self.output_dir = output_dir
    self.exit_event = threading.Event()
    self.scheduler = sched.scheduler(time.time, self._sleep_until_exit)

  def _sleep_until_exit(self, timeout):
    start_time = time.time()
    cur_time = time.time()
    while cur_time - start_time < timeout and not self.exit_event.is_set():
      time.sleep(min(1, timeout + start_time - cur_time))
      cur_time = time.time()

  def start(self):
    """Schedule start/stop profiler event specified in profiler_enabled_time_str."""
    if not self.profiler_enabled_time_str:
      return

    last_end_time = -1
    for time_str in self.profiler_enabled_time_str.split(','):
      begin_time = int(time_str.split(':')[0].strip())
      end_time_str = time_str.split(':')[1].strip() if ':' in time_str else None
      end_time = int(end_time_str) if end_time_str else 365 * 24 * 60 * 60
      if begin_time <= last_end_time:
        raise ValueError('begin_time {} is no larger than the last '
                         'end_time {}'.format(begin_time, last_end_time))
      if end_time <= begin_time:
        raise ValueError('end_time {} is no larger than begin_time {}'.format(
            end_time, begin_time))
      # 4th positional arg added to support Python2 for the short-term.
      self.scheduler.enter(begin_time, 1, _start_profiler,
                           argument=(self.output_dir,))
      self.scheduler.enter(end_time, 1, _stop_profiler, ())  # pylint: disable=no-value-for-parameter
      last_end_time = end_time

    threading.Thread(target=self.scheduler.run).start()

  def stop(self):
    """Stop scheduler and save profiler data if any event is cancelled."""
    event_canceled = False
    for event in self.scheduler.queue:
      try:
        self.scheduler.cancel(event)
        event_canceled = True
      except ValueError:
        # This is OK because the event may have been just canceled
        pass

    # Signal the scheduler thread to stop sleeping
    self.exit_event.set()

    # Save the profiler data if any event is canceled
    if event_canceled:
      _stop_profiler()
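For reference, a minimal usage sketch of the profiler class above (not part of the commit). The --profiler_enabled_time value is a comma-separated list of begin:end offsets in seconds, as parsed by start(); the output directory below is a placeholder.

# Hedged sketch; '/tmp/perfzero_logs' and the time window are example values only.
from perfzero.tensorflow_profiler import TensorFlowProfiler

# Profile seconds 60-120, then again from second 300 until the run ends.
profiler = TensorFlowProfiler('60:120,300', '/tmp/perfzero_logs')
profiler.start()   # schedules _start_profiler/_stop_profiler on a background thread
# ... run the benchmark workload here ...
profiler.stop()    # cancels pending events and flushes profiler data if a window was still open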
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_no_processes.txt
deleted file (mode 100644 → 0)
Tue Jan 9 09:34:25 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-SXM2... On | 00000000:06:00.0 Off | 0 |
| N/A 50C P0 196W / 300W | 15643MiB / 16276MiB | 97% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P100-SXM2... On | 00000000:07:00.0 Off | 0 |
| N/A 41C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P100-SXM2... On | 00000000:0A:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P100-SXM2... On | 00000000:0B:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 4 Tesla P100-SXM2... On | 00000000:85:00.0 Off | 0 |
| N/A 36C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 5 Tesla P100-SXM2... On | 00000000:86:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 6 Tesla P100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 38C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 7 Tesla P100-SXM2... On | 00000000:8A:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/example_nvidia-smi_processes.txt
deleted file (mode 100644 → 0)
Tue Jan 9 09:34:25 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P100-SXM2... On | 00000000:06:00.0 Off | 0 |
| N/A 50C P0 196W / 300W | 15643MiB / 16276MiB | 97% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P100-SXM2... On | 00000000:07:00.0 Off | 0 |
| N/A 41C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P100-SXM2... On | 00000000:0A:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P100-SXM2... On | 00000000:0B:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 4 Tesla P100-SXM2... On | 00000000:85:00.0 Off | 0 |
| N/A 36C P0 50W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 5 Tesla P100-SXM2... On | 00000000:86:00.0 Off | 0 |
| N/A 33C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 6 Tesla P100-SXM2... On | 00000000:89:00.0 Off | 0 |
| N/A 38C P0 48W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 7 Tesla P100-SXM2... On | 00000000:8A:00.0 Off | 0 |
| N/A 34C P0 49W / 300W | 15483MiB / 16276MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 44454 C /usr/bin/python 15631MiB |
| 1 44454 C /usr/bin/python 15471MiB |
| 2 44454 C /usr/bin/python 15471MiB |
| 3 44454 C /usr/bin/python 15471MiB |
+-----------------------------------------------------------------------------+
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/test_files/nvme_device_log.txt
deleted file (mode 100644 → 0)
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
nvme0n8 259:7 0 375G 0 disk
nvme0n6 259:5 0 375G 0 disk
sdb 8:16 0 50G 0 disk
└─sdb1 8:17 0 50G 0 part /tmpfs
nvme0n4 259:3 0 375G 0 disk
nvme0n2 259:1 0 375G 0 disk
nvme0n7 259:6 0 375G 0 disk
nvme0n5 259:4 0 375G 0 disk
sda 8:0 0 100G 0 disk
└─sda1 8:1 0 100G 0 part /
nvme0n3 259:2 0 375G 0 disk
nvme0n1 259:0 0 375G 0 disk
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/tpu_runtime_utils.py
deleted file (mode 100644 → 0)
"""Utility to manage the tpu version before starting the benchmark."""
import json
from absl import logging
from six.moves.urllib import request

try:
  from cloud_tpu_client import client  # pylint: disable=g-import-not-at-top
except ImportError:
  print('Falling back to TensorFlow client; we recommended you install the Cloud '
        'TPU client directly with pip install cloud-tpu-client.')
  from tensorflow.python.tpu.client import client  # pylint: disable=g-import-not-at-top


def _as_text(s):
  """Converts a byte/string into string."""
  if isinstance(s, bytes):
    return s.decode('utf-8')
  return s


def _get_content(url):
  """Opens the url and loads the response into json."""
  logging.info('opening url %s', url)
  req = request.Request(url)
  resp = request.urlopen(req)
  resp_text = _as_text(resp.read())
  logging.info('response text = %s', resp_text)
  return json.loads(resp_text)


def _get_version_info(url, version_label):
  """Constructs a version info from the response."""
  json_data = _get_content(url)
  logging.info('json_data = %s', json_data)
  if 'currentVersion' in json_data:
    commit_id = json_data['currentVersion']
  elif 'buildLabel' in json_data:
    commit_id = json_data['buildLabel']
  else:
    commit_id = ''
  info = {
      'url': '',
      'hash': commit_id,
      'branch': version_label,
      'piper_id': json_data.get('piperOriginRevId', '')
  }
  return info


def _configure_tpu_version(tpu_name, version_label, new_version_id):
  """Returns the current tpu version after resetting to an optional version."""
  # The tpu_name is arbitrary / user chosen unique string for this tpu.
  logging.info('Trying to connect to tpu %s', tpu_name)
  tpu_client = client.Client(tpu=tpu_name)
  tpu_client.wait_for_healthy()
  if new_version_id:
    logging.info('Trying to reset tpu version to %s', new_version_id)
    tpu_client.configure_tpu_version(version=new_version_id)
    tpu_client.wait_for_healthy()
    logging.info('TPU healthy after version reset.')
  else:
    logging.info('Using the default tpu version id.')
  workers = tpu_client.network_endpoints()
  if workers:
    ip_addr = workers[0]['ipAddress']
    url = 'http://{}:8475/requestversion'.format(ip_addr)
    return _get_version_info(url, version_label)
  else:
    logging.error('No tpu endpoint info')
    return {
        'url': '',
        'hash': '',
        'branch': version_label,
        'piper_id': '',
    }


def configure_tpu(tpu_params):
  return _configure_tpu_version(
      tpu_params.get('name'),
      version_label=tpu_params.get('version'),
      new_version_id=tpu_params.get('version_id'))
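A hedged sketch of how configure_tpu above might be called. Only the dict keys ('name', 'version', 'version_id') come from the code; the values are placeholders, and a reachable Cloud TPU with that name is assumed.

from perfzero import tpu_runtime_utils

tpu_params = {
    'name': 'my-tpu',        # placeholder TPU name
    'version': 'nightly',    # placeholder version label recorded in the result
    'version_id': '',        # empty keeps the default runtime version
}
version_info = tpu_runtime_utils.configure_tpu(tpu_params)
print(version_info)  # dict with 'url', 'hash', 'branch' and 'piper_id' keys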
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils.py
deleted file (mode 100644 → 0)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero utility methods."""
from __future__ import print_function

import importlib
import logging
import os
import shutil
import subprocess
import sys
import threading
import traceback
import requests
import json
import re


def create_empty_file(parent_directory, file_basename):
  """Creates an empty file with a given basename in a parent directory.

  Creates parent_directory and intermediate directories if it doesn't exist.
  This is mostly used for creating no-op actions in the Dockerfile.

  Args:
    parent_directory: The path to the parent directory.
    file_basename: The basename for the empty file.
  """
  if not os.path.isdir(parent_directory):
    os.makedirs(parent_directory)
  full_file_name = os.path.join(parent_directory, file_basename)
  with open(full_file_name, 'w'):
    print('Creating empty file: {}'.format(full_file_name))


def checkout_git_repos(git_repos, use_cached_site_packages):
  """Clone, update, or sync a repo.

  Args:
    git_repos: array of dict containing attributes of the git repo to checkout.
    use_cached_site_packages: If true, skip git pull if git_repo already exists.

  Returns:
    A dict containing attributes of the git repositories
  """
  site_package_info = {}
  for repo in git_repos:
    logging.info('Checking out repository from %s to %s',
                 repo['url'], repo['local_path'])
    if not os.path.isdir(repo['local_path']):
      run_commands(['git clone {} {}'.format(repo['url'], repo['local_path'])])
    if 'branch' in repo:
      run_commands(['git -C {} checkout {}'.format(repo['local_path'],
                                                   repo['branch'])])
    if not use_cached_site_packages or 'git_hash' in repo:
      run_commands(['git -C {} pull --rebase'.format(repo['local_path'])])
    if 'git_hash' in repo:
      run_commands(['git -C {} reset --hard {}'.format(repo['local_path'],
                                                       repo['git_hash'])])
    logging.info('Checked-out repository from %s to %s',
                 repo['url'], repo['local_path'])
    site_package_info[repo['dir_name']] = get_git_repo_info(repo['local_path'])

  return site_package_info


def get_git_repo_info(local_path):
  """Get information of the git repository specified by the local_path."""
  git_repo_info = {}

  # Get git url
  cmd = 'git -C {} config --get remote.origin.url'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['url'] = lines[0]
  else:
    logging.error('Error getting git url for repository %s due to %s',
                  local_path, result)
    return {}

  # Get git branch
  cmd = 'git -C {} rev-parse --abbrev-ref HEAD'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['branch'] = lines[0]
  else:
    logging.error('Error getting git branch for repository %s due to %s',
                  local_path, result)
    return {}

  # Get git hash
  cmd = 'git -C {} rev-parse HEAD'.format(local_path)
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    git_repo_info['hash'] = lines[0]
  else:
    logging.error('Error getting git hash for repository %s due to %s',
                  local_path, result)
    return {}

  return git_repo_info


def setup_python_path(site_packages_dir, python_path_str):
  if python_path_str:
    python_paths = python_path_str.split(',')
    for python_path in python_paths:
      logging.info('Adding path %s to sys.path', python_path)
      sys.path.append(os.path.join(site_packages_dir, python_path))
  logging.debug('PYTHONPATH: %s', sys.path)


def active_gcloud_service(gcloud_key_file_url, workspace_dir,
                          download_only=False):
  """Download key file and setup gcloud service credential using the key file.

  Args:
    gcloud_key_file_url: gcloud key file url
    workspace_dir: directory that the key file is downloaded to
    download_only: skip setting up the gcloud service credential if this is true
  """
  if not gcloud_key_file_url:
    return

  local_path = os.path.join(workspace_dir,
                            os.path.basename(gcloud_key_file_url))
  if not os.path.exists(local_path):
    download_data([{'url': gcloud_key_file_url, 'local_path': local_path}])

  if not download_only:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = local_path
    run_commands(['gcloud auth activate-service-account --key-file {}'.format(
        local_path)])
    logging.info('Activated gcloud service account credential')


def setup_gsutil_credential():
  run_commands(['gcloud config set pass_credentials_to_gsutil true'])


def download_data(download_infos):
  """Download data from url to local_path for each (url, local_path) pair in the download_infos.

  Each url should start with either gs://, http:// or https://
  Downloaded file whose name ends with .gz will be decompressed in its
  current directory

  Args:
    download_infos: array of dict which specifies the url and local_path for
      data download
  """
  for info in download_infos:
    if os.path.exists(info['local_path']):
      continue
    original_base_name = os.path.basename(info['url'])
    expected_base_name = os.path.basename(info['local_path'])
    local_path_parent = os.path.dirname(info['local_path'])
    logging.info('Downloading data from %s to %s',
                 info['url'], info['local_path'])
    make_dir_if_not_exist(local_path_parent)

    # Download data to the local path
    if info['url'].startswith('http://') or info['url'].startswith('https://'):
      request = requests.get(info['url'], allow_redirects=True)
      f = open(info['local_path'], 'wb')
      f.write(request.content)
      f.close()
    elif info['url'].startswith('gs://'):
      cmd = ['gsutil', '-m', 'cp', '-r', '-n', info['url'], local_path_parent]
      run_commands([cmd], shell=False)
    elif info['url'].startswith('file://'):
      cmd = ['cp', info['url'][7:], local_path_parent]
      run_commands([cmd], shell=False)
    else:
      raise ValueError('Url {} with prefix {} is not supported.'.format(
          info['url'], info['url'].split(':')[0]))

    # Move data to the expected local path
    if original_base_name != expected_base_name:
      run_commands(['mv {} {}'.format(
          os.path.join(local_path_parent, original_base_name),
          os.path.join(local_path_parent, expected_base_name))])
    logging.info('Downloaded data from %s to %s',
                 info['url'], info['local_path'])

    # Decompress file if file name ends with .gz unless caller sets 'decompress'
    # to False in info.
    if info['url'].endswith('.gz') and info.get('decompress', True):
      run_commands(['tar xvf {} -C {}'.format(info['local_path'],
                                              local_path_parent)])
      logging.info('Decompressed file %s', info['local_path'])


def parse_data_downloads_str(root_data_dir, data_downloads_str):
  """Parse a comma separated string into array of dicts.

  Each dict specifies the url and local_path for a download.

  Args:
    root_data_dir: the directory which should contain all the dataset files
    data_downloads_str: a comma separated string specified by the
      flag --data_downloads

  Returns:
    An array of dict which specifies the url and local_path for data download
  """
  download_infos = []
  if not data_downloads_str:
    return download_infos

  for entry in data_downloads_str.split(','):
    info = {}
    if ';' in entry:
      info['url'] = entry.split(';')[0]
      info['local_path'] = os.path.join(root_data_dir, entry.split(';')[1])
    else:
      info['url'] = entry
      info['local_path'] = os.path.join(root_data_dir,
                                        os.path.basename(entry))
    # Canonicalize url to remove trailing '/' and '*'
    if info['url'].endswith('*'):
      info['url'] = info['url'][:-1]
    if info['url'].endswith('/'):
      info['url'] = info['url'][:-1]
    download_infos.append(info)

  return download_infos


def maybe_upload_to_gcs(local_dir, output_gcs_url):
  if not output_gcs_url:
    return
  run_commands(['gsutil -m cp -r {} {}'.format(local_dir, output_gcs_url)])
  logging.info('Uploaded data from local directory %s to gcs %s',
               local_dir, output_gcs_url)


def make_dir_if_not_exist(local_path):
  if not os.path.exists(local_path):
    os.makedirs(local_path)
    logging.info('Created directory %s', local_path)


def run_command(cmd, shell=True):
  """Structures for a variety of different test results.

  Args:
    cmd: Command to execute
    shell: True to use shell, false otherwise.

  Returns:
    Tuple of the command return value and the standard out in as a string.
  """
  logging.debug('Executing command: %s', cmd)
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                       shell=shell)
  exit_code = None
  line = ''
  stdout = ''
  while exit_code is None or line:
    exit_code = p.poll()
    line = p.stdout.readline().decode('utf-8')
    stdout += line
    logging.debug(line)
  return exit_code, stdout


def run_commands(cmds, shell=True):
  """Runs list of command and throw error if any fail."""
  for cmd in cmds:
    exit_code, stdout = run_command(cmd, shell=shell)
    if exit_code:
      raise Exception('"{}" failed with code:{} and stdout:\n{}'.format(
          cmd, exit_code, stdout))


def get_cpu_name():
  cmd = "cat /proc/cpuinfo | grep 'model name' | sort --unique"
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    model_name_parts = lines[0].split(':')
    return model_name_parts[1].strip()
  else:
    logging.error('Error getting cpuinfo model name: %s', result)
    return ''


def get_cpu_socket_count():
  cmd = 'grep -i "physical id" /proc/cpuinfo | sort -u | wc -l'
  exit_code, result = run_command(cmd)
  lines = result.splitlines()
  if exit_code == 0 and lines:
    return int(lines[0])
  else:
    logging.error('Error getting cpuinfo scocket count: %s', result)
    return -1


def _get_amd_gpu_info():
  """Returns gpu information using rocm-smi.

  Note: Assumes if the system has multiple GPUs, that they are all the same

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    `rocm-smi` is not found or fails.
  """
  cmd = 'rocm-smi --json --showproductname --showdriverversion'
  exit_code, result = run_command(cmd)
  if exit_code != 0:
    logging.error('rocm-smi did not return as expected: %s', result)
    return None

  def get_gpu_driver_version(rocm_smi_output):
    return rocm_smi_output['system']['Driver version']

  def get_gpu_model(rocm_smi_output):
    gpu_model = ""
    for key, value in rocm_smi_output.items():
      if re.match("card[0-9]+", key):
        gpu_model = value['Card SKU']
        break
    return gpu_model

  def get_gpu_count(rocm_smi_output):
    gpu_count = 0
    for key, value in rocm_smi_output.items():
      if re.match("card[0-9]+", key):
        gpu_count += 1
    return gpu_count

  rocm_smi_output = json.loads(result)
  gpu_info = {}
  gpu_info['gpu_driver_version'] = get_gpu_driver_version(rocm_smi_output)
  gpu_info['gpu_model'] = get_gpu_model(rocm_smi_output)
  gpu_info['gpu_count'] = get_gpu_count(rocm_smi_output)
  return gpu_info


def _get_nvidia_gpu_info():
  """Returns gpu information using nvidia-smi.

  Note: Assumes if the system has multiple GPUs that they are all the same with
  one exception. If the first result is a Quadro, the heuristic assumes
  this may be a workstation and takes the second entry.

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    `nvidia-smi` is not found or fails.
  """
  cmd = 'nvidia-smi --query-gpu=driver_version,gpu_name --format=csv'
  exit_code, result = run_command(cmd)
  if exit_code != 0:
    logging.error('nvidia-smi did not return as expected: %s', result)
    return None

  lines = result.splitlines()
  gpu_info_line = lines[1]
  if 'Quadro' in gpu_info_line and len(lines) >= 3:
    gpu_info_line = lines[2]

  gpu_info = {}
  gpu_info['gpu_driver_version'] = gpu_info_line.split(',')[0].strip()
  gpu_info['gpu_model'] = gpu_info_line.split(',')[1].strip()
  gpu_info['gpu_count'] = len(lines) - 1

  return gpu_info


def get_gpu_info():
  """Returns gpu information using either nvidia-smi or rocm-smi.

  Returns:
    A dict containing gpu_driver_version, gpu_model and gpu_count or None if
    `nvidia-smi` is not found or fails.
  """
  return _get_amd_gpu_info() if shutil.which("rocm-smi") \
      else _get_nvidia_gpu_info()


def _install_tpu_tool():
  """Installs the ctpu tool to managing cloud TPUs.

  Follows the instructions here:
  https://github.com/tensorflow/tpu/tree/master/tools/ctpu
  """
  if not os.path.exists('ctpu'):
    logging.info('Installing TPU tool')
    commands = [
        'wget https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu',
        'chmod a+x ctpu',
    ]
    run_commands(commands)


def setup_tpu(parameters):
  """Sets up a TPU with a given set of parameters.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during setup.
  """
  try:
    _install_tpu_tool()
    args = [
        '--name={}'.format(parameters.get('name')),
        '--project={}'.format(parameters.get('project')),
        '--zone={}'.format(parameters.get('zone')),
        '--tpu-size={}'.format(parameters.get('size')),
        '--tf-version={}'.format(parameters.get('version')),
        '--tpu-only',
        '-noconf',
    ]
    command = './ctpu up {}'.format(' '.join(args))
    logging.info('Setting up TPU: %s', command)
    exit_code, output = run_command(command)
    if exit_code != 0:
      logging.error('Error in setup with output: %s', output)
    return exit_code != 0
  except Exception:
    logging.error('Unable to setup TPU')
    run_command('rm -f ctpu')
    sys.exit(1)


def cleanup_tpu(parameters):
  """Cleans up an existing TPU.

  Args:
    parameters: dictionary of TPU parameters.

  Returns:
    True if an error occurs during cleanup.
  """
  _install_tpu_tool()
  args = [
      '--name={}'.format(parameters.get('name')),
      '--project={}'.format(parameters.get('project')),
      '--zone={}'.format(parameters.get('zone')),
      '--tpu-only',
      '-noconf',
  ]
  command = './ctpu delete {}'.format(' '.join(args))
  logging.info('Cleaning up TPU: %s', command)
  exit_code, output = run_command(command)
  if exit_code != 0:
    logging.error('Error in cleanup with output: %s', output)
  return exit_code != 0


def read_benchmark_result(benchmark_result_file_path):
  """Read benchmark result from the protobuf file."""
  from google.protobuf import json_format  # pylint: disable=g-import-not-at-top
  from tensorflow.core.util import test_log_pb2  # pylint: disable=g-import-not-at-top

  if not os.path.isfile(benchmark_result_file_path):
    logging.error('Failed to read benchmark result because '
                  'file %s does not exist', benchmark_result_file_path)
    return {}

  with open(benchmark_result_file_path, 'rb') as f:
    benchmark_entries = test_log_pb2.BenchmarkEntries()
    benchmark_entries.ParseFromString(f.read())
    return json_format.MessageToDict(
        benchmark_entries, preserving_proto_field_name=True,
        including_default_value_fields=True)['entry'][0]


def print_thread_stacktrace():
  print('Here is the stacktrace for all threads:')
  thread_names = {t.ident: t.name for t in threading.enumerate()}
  for thread_id, frame in sys._current_frames().items():  # pylint: disable=protected-access
    print('Thread {}'.format(thread_names.get(thread_id, thread_id)))
    traceback.print_stack(frame)


def instantiate_benchmark_class(benchmark_class, output_dir, root_data_dir,
                                tpu, constructor_args,
                                benchmark_class_type=None):
  """Return initialized benchmark class."""
  module_import_path, class_name = benchmark_class.rsplit('.', 1)
  module = importlib.import_module(module_import_path)
  class_ = getattr(module, class_name)

  if benchmark_class_type == 'tf_benchmark':
    # for benchmarks inheriting from tf.test.Benchmark, instantiate them directly.
    instance = class_(**constructor_args)
  else:
    # Default instantiation for perfzero_benchmark classes.
    instance = class_(
        output_dir=output_dir,
        root_data_dir=root_data_dir,
        tpu=tpu,
        **constructor_args)

  return instance


def copy_and_rename_dirs(dir_spec_string, dst_base_dir):
  """Copies list of <dir-path>:new_name specs into a new dest dir.

  If a path /path1/path2/dir:new_dir is given, it copies /path1/path2/dir to
  dst_base_dir/new_dir.

  Args:
    dir_spec_string: Comma separated list of /path1/path2:new_name specs.
    dst_base_dir: The base dir to contain the copies.
  """
  if not dir_spec_string:
    return
  dir_specs = dir_spec_string.split(',')
  for src_dir_with_name in dir_specs:
    src_dir, final_basename = src_dir_with_name.split(':')
    dst_dir = os.path.join(dst_base_dir, final_basename)
    if os.path.isdir(dst_dir):
      logging.info('[DELETE] pre-existing %s', dst_dir)
      shutil.rmtree(dst_dir)
    logging.info('[COPY] %s -> %s', src_dir, dst_dir)
    shutil.copytree(src_dir, dst_dir)
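A short, hedged example of the --data_downloads parsing and download helpers above; the bucket URLs and data directory are placeholders, not values from the commit.

import perfzero.utils as utils

# Two downloads: the first keeps its basename, the second is renamed to
# 'imagenet' under the root data directory (placeholder values throughout).
infos = utils.parse_data_downloads_str(
    '/data',
    'gs://my-bucket/cifar10.tar.gz,gs://my-bucket/imagenet_tfrecords;imagenet')
# infos == [
#   {'url': 'gs://my-bucket/cifar10.tar.gz', 'local_path': '/data/cifar10.tar.gz'},
#   {'url': 'gs://my-bucket/imagenet_tfrecords', 'local_path': '/data/imagenet'},
# ]
utils.download_data(infos)  # gsutil copy; the .gz entry is also untarred in place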
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/perfzero/utils_test.py
deleted file (mode 100644 → 0)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests utils.py."""
import os
import unittest

from mock import call
from mock import MagicMock
from mock import patch

import perfzero.utils as utils
import tensorflow as tf  # pylint: disable=g-bad-import-order


class TestUtils(unittest.TestCase, tf.test.Benchmark):

  def test_protobuf_read(self):
    output_dir = '/tmp/'
    os.environ['TEST_REPORT_FILE_PREFIX'] = output_dir
    benchmark_result_file_path = os.path.join(output_dir,
                                              'TestUtils.testReportBenchmark')
    if os.path.exists(benchmark_result_file_path):
      os.remove(benchmark_result_file_path)

    self.report_benchmark(
        iters=2000,
        wall_time=1000,
        name='testReportBenchmark',
        metrics=[{'name': 'metric_name_1', 'value': 0, 'min_value': 1},
                 {'name': 'metric_name_2', 'value': 90, 'min_value': 0,
                  'max_value': 95}])

    actual_result = utils.read_benchmark_result(benchmark_result_file_path)
    os.remove(benchmark_result_file_path)

    expected_result = {
        'name': 'TestUtils.testReportBenchmark',
        # google.protobuf.json_format.MessageToDict() will convert
        # int64 field to string.
        'iters': '2000',
        'wall_time': 1000,
        'cpu_time': 0,
        'throughput': 0,
        'extras': {},
        'metrics': [
            {'name': 'metric_name_1', 'value': 0, 'min_value': 1},
            {'name': 'metric_name_2', 'value': 90, 'min_value': 0,
             'max_value': 95}
        ]
    }
    self.assertDictEqual(expected_result, actual_result)

  @patch('perfzero.utils.get_git_repo_info')
  @patch('perfzero.utils.run_commands')
  def test_checkout_git_repos(self, run_commands_mock,
                              get_git_repo_info_mock):
    git_repo_1 = {}
    git_repo_1['url'] = 'url_1'
    git_repo_1['local_path'] = 'local_path_1'
    git_repo_1['dir_name'] = 'dir_name_1'
    git_repo_1['branch'] = 'branch_1'
    git_repo_1['git_hash'] = 'git_hash_1'

    git_repo_2 = {}
    git_repo_2['url'] = 'url_2'
    git_repo_2['local_path'] = 'local_path_2'
    git_repo_2['dir_name'] = 'dir_name_2'
    git_repo_2['branch'] = 'branch_2'

    git_repo_info_1 = {'url': 'url_1'}
    git_repo_info_2 = {'url': 'url_2'}
    get_git_repo_info_mock.side_effect = \
        lambda local_path: git_repo_info_1 if local_path == 'local_path_1' else git_repo_info_2  # pylint: disable=line-too-long

    site_package_info = utils.checkout_git_repos([git_repo_1, git_repo_2],
                                                 False)

    self.assertEqual(2, len(site_package_info))
    self.assertEqual(git_repo_info_1, site_package_info['dir_name_1'])
    self.assertEqual(git_repo_info_2, site_package_info['dir_name_2'])
    run_commands_mock.assert_has_calls(
        any_order=False,
        calls=[
            call(['git clone url_1 local_path_1']),
            call(['git -C local_path_1 checkout branch_1']),
            call(['git -C local_path_1 pull --rebase']),
            call(['git -C local_path_1 reset --hard git_hash_1']),
            call(['git clone url_2 local_path_2']),
            call(['git -C local_path_2 checkout branch_2'])
        ])

  @patch('perfzero.utils.run_command')
  def test_get_git_repo_info(self, run_command_mock):
    run_command_mock.side_effect = [
        [0, 'git_url'],
        [0, 'branch_name'],
        [0, 'git_hash']
    ]
    git_repo_info = utils.get_git_repo_info('local_path_1')

    self.assertEqual(
        {'url': 'git_url', 'branch': 'branch_name', 'hash': 'git_hash'},
        git_repo_info)
    run_command_mock.assert_has_calls(
        any_order=False,
        calls=[
            call('git -C local_path_1 config --get remote.origin.url'),
            call('git -C local_path_1 rev-parse --abbrev-ref HEAD'),
            call('git -C local_path_1 rev-parse HEAD')
        ])

  @patch('builtins.open')
  @patch('perfzero.utils.make_dir_if_not_exist')
  @patch('requests.get')
  @patch('perfzero.utils.run_commands')
  def test_download_data(self, run_commands_mock, requests_get_mock,
                         make_dir_mock, open_mock):  # pylint: disable=unused-argument
    get_mock = MagicMock()
    get_mock.content = 'content'
    requests_get_mock.return_value = get_mock

    download_info_1 = {
        'url': 'gs://remote_path_1/name_1',
        'local_path': 'local_path_1/modified_name_1'
    }
    download_info_2 = {
        'url': 'http://remote_path_2/name_2',
        'local_path': 'local_path_2/modified_name_2'
    }
    utils.download_data([download_info_1, download_info_2])

    make_dir_mock.assert_has_calls(
        any_order=False,
        calls=[call('local_path_1'), call('local_path_2')])
    requests_get_mock.assert_called_once_with('http://remote_path_2/name_2',
                                              allow_redirects=True)
    run_commands_mock.assert_has_calls(
        any_order=False,
        calls=[
            call([['gsutil', '-m', 'cp', '-r', '-n',
                   'gs://remote_path_1/name_1', 'local_path_1']], shell=False),
            call(['mv local_path_1/name_1 local_path_1/modified_name_1']),
            call(['mv local_path_2/name_2 local_path_2/modified_name_2'])
        ])

  def test_parse_data_downloads_str(self):
    data_downloads_str = 'url_1;relative_path_1,url_2;relative_path_2'
    download_infos = utils.parse_data_downloads_str('/root_data_dir',
                                                    data_downloads_str)

    self.assertEqual(2, len(download_infos))
    self.assertEqual(download_infos[0],
                     {'url': 'url_1',
                      'local_path': '/root_data_dir/relative_path_1'})
    self.assertEqual(download_infos[1],
                     {'url': 'url_2',
                      'local_path': '/root_data_dir/relative_path_2'})

  @patch('perfzero.utils.run_command')
  def test_get_cpu_name(self, run_command_mock):
    """Tests extract the cpu model name."""
    run_command_mock.return_value = [
        0, 'model name : Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz\n']
    cpu_name = utils.get_cpu_name()
    self.assertEqual('Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz', cpu_name)

  @patch('perfzero.utils.run_command')
  def test_get_cpu_socket_count(self, run_command_mock):
    """Tests get socket count."""
    run_command_mock.return_value = [0, '2\n']
    cpu_socket_count = utils.get_cpu_socket_count()
    self.assertEqual(2, cpu_socket_count)

  @patch('perfzero.utils.run_command')
  def test_get_gpu_model(self, run_command_mock):
    # Tests get gpu info parses expected value into expected components.
    run_command_mock.return_value = [
        0, 'driver_version, name\n381.99, GTX 1080\n']
    gpu_model = utils.get_gpu_info()['gpu_model']
    self.assertEqual('GTX 1080', gpu_model)

    # Tests gpu info returns second entry if first entry is a Quadro.
    run_command_mock.return_value = [
        0, 'blah\n200.99, Quadro K900\n381.99, GTX 1080\n']
    gpu_model = utils.get_gpu_info()['gpu_model']
    self.assertEqual('GTX 1080', gpu_model)

  @patch('perfzero.utils.run_command')
  def test_get_gpu_count(self, run_command_mock):
    """Tests gpu info returns second entry if first entry is a Quadro."""
    run_command_mock.return_value = [
        0, 'blah\n200.99, Quadro K900\n381.99, GTX 1080\n']
    gpu_count = utils.get_gpu_info()['gpu_count']
    self.assertEqual(2, gpu_count)
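A hedged sketch of running this test module with the standard library test runner. It assumes perfzero/lib is on PYTHONPATH and that the mock and tensorflow packages used by the tests are installed; the discovery pattern below is an assumption, not part of the commit.

import unittest

if __name__ == '__main__':
  # Discover utils_test.py inside the perfzero package and run it verbosely.
  suite = unittest.defaultTestLoader.discover('perfzero', pattern='utils_test.py')
  unittest.TextTestRunner(verbosity=2).run(suite)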
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/lib/setup.py
deleted file (mode 100644 → 0)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Checkout repository, download data and build docker image."""
from __future__ import print_function

import argparse
import json
import logging
import os
import shutil
import sys
import tempfile
import time

import perfzero.device_utils as device_utils
import perfzero.perfzero_config as perfzero_config
import perfzero.utils as utils


def _temporary_file_name(parent_dir, base_name):
  """Returns a temp name of the form <parent-dir>/<random>/<base-name>."""
  if not os.path.isdir(parent_dir):
    os.makedirs(parent_dir)
  temp_dir = tempfile.mkdtemp(dir=parent_dir)
  return os.path.join(temp_dir, base_name)


def _load_docker_image(FLAGS, workspace_dir, setup_execution_time):
  """Runs docker load --input_image <FLAGS.dockerfile_path>.

  Fetches FLAGS.dockerfile_path to workspace_dir/<temp-dir>/local_docker first.
  Runs docker load --input <path-to-local-docker>.
  Deletes workspace_dir/<temp-dir> after the docker image is loaded.

  Args:
    FLAGS: parser.parse_known_args object.
    workspace_dir: String - The path to use for intermediate artifacts.
    setup_execution_time: Map from string->double containing wall times for
      different operations. This will have insertions describing the docker
      setup time.
  """
  load_docker_start_time = time.time()
  local_docker_image_path = _temporary_file_name(workspace_dir, 'local_docker')
  utils.download_data([{
      'url': FLAGS.dockerfile_path,
      'local_path': local_docker_image_path,
      'decompress': False
  }])
  setup_execution_time['fetch_docker'] = time.time() - load_docker_start_time
  docker_load_cmd = 'docker load --input {}'.format(local_docker_image_path)
  try:
    utils.run_commands([
        docker_load_cmd,
        'docker images'  # Print loaded image list.
    ])
    setup_execution_time['load_docker'] = time.time() - load_docker_start_time
  finally:
    logging.info('removing parent dir of local docker image copy %s',
                 local_docker_image_path)
    shutil.rmtree(os.path.dirname(local_docker_image_path))


def _create_docker_image(FLAGS, project_dir, workspace_dir,
                         setup_execution_time):
  """Creates a docker image.

  Args:
    FLAGS: parser.parse_known_args object.
    project_dir: String - The current project path.
    workspace_dir: String - The path to use for intermediate artifacts.
    setup_execution_time: Map from string->double containing wall times for
      different operations. This will have insertions describing the docker
      setup time.
  """
  # Create docker image
  docker_start_time = time.time()
  docker_context = os.path.join(workspace_dir, 'resources')
  # Necessary in case we don't have a local .whl file.
  utils.create_empty_file(docker_context, 'EMPTY')

  # Download TensorFlow pip package from Google Cloud Storage and modify package
  # path accordingly, if applicable
  local_tensorflow_pip_spec = None
  if (FLAGS.tensorflow_pip_spec and
      (FLAGS.tensorflow_pip_spec.startswith('gs://') or
       FLAGS.tensorflow_pip_spec.startswith('file://'))):
    local_pip_filename = os.path.basename(FLAGS.tensorflow_pip_spec)
    local_pip_path = os.path.join(docker_context, local_pip_filename)
    utils.download_data([{
        'url': FLAGS.tensorflow_pip_spec,
        'local_path': local_pip_path
    }])
    # Update path to pip wheel file for the Dockerfile. Note that this path has
    # to be relative to the docker context (absolute path will not work).
    FLAGS.tensorflow_pip_spec = local_pip_filename
    local_tensorflow_pip_spec = local_pip_filename
  else:
    local_tensorflow_pip_spec = 'EMPTY'

  dockerfile_path = FLAGS.dockerfile_path
  if not os.path.exists(dockerfile_path):
    # Fall back to the deprecated approach if the user-specified
    # dockerfile_path does not exist
    dockerfile_path = os.path.join(project_dir, FLAGS.dockerfile_path)

  extra_pip_specs = (FLAGS.extra_pip_specs or '').replace(';', ' ')
  docker_base_cmd = 'docker build --no-cache --pull'

  # FLAGS.extra_docker_build_args will be a list of strings (e.g. ['a', 'b=c']).
  # We treat the strings directly as build-args: --build-arg a --build-arg b=c
  # Empty strings are ignored.
  extra_docker_build_args = ' '.join(
      ['--build-arg %s' % arg for arg in FLAGS.extra_docker_build_args if arg])

  cmd = '{docker_base_cmd} -t {docker_tag}{tf_pip}{local_tf_pip}{extra_pip}{extra_docker_build_args} {suffix}'.format(
      docker_base_cmd=docker_base_cmd,
      docker_tag=FLAGS.docker_tag,
      tf_pip=(' --build-arg tensorflow_pip_spec={}'.format(
          FLAGS.tensorflow_pip_spec) if FLAGS.tensorflow_pip_spec else ''),
      # local_tensorflow_pip_spec is either string 'EMPTY' or basename of
      # local .whl file.
      local_tf_pip=' --build-arg local_tensorflow_pip_spec={}'.format(
          local_tensorflow_pip_spec),
      extra_pip=' --build-arg extra_pip_specs=\'{}\''.format(extra_pip_specs),
      extra_docker_build_args=' ' + extra_docker_build_args,
      suffix=('-f {} {}'.format(dockerfile_path, docker_context)
              if docker_context else '- < {}'.format(dockerfile_path)))

  utils.run_commands([cmd])
  logging.info('Built docker image with tag %s', FLAGS.docker_tag)
  setup_execution_time['build_docker'] = time.time() - docker_start_time


if __name__ == '__main__':
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  perfzero_config.add_setup_parser_arguments(parser)
  FLAGS, unparsed = parser.parse_known_args()
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                      level=logging.DEBUG)

  if unparsed:
    logging.error('Arguments %s are not recognized', unparsed)
    sys.exit(1)

  setup_execution_time = {}
  project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
  workspace_dir = os.path.join(project_dir, FLAGS.workspace)
  site_package_dir = os.path.join(workspace_dir, 'site-packages')
  utils.copy_and_rename_dirs(FLAGS.site_package_downloads, site_package_dir)

  activate_gcloud = False
  if FLAGS.dockerfile_path and FLAGS.dockerfile_path.startswith('gs://'):
    # We might end up doing gsutil fetch later, so need to call
    # active_gcloud_service().
    activate_gcloud = True
  if FLAGS.tensorflow_pip_spec and FLAGS.tensorflow_pip_spec.startswith('gs://'):
    activate_gcloud = True

  # Download gcloud auth token. Remove this operation in the future when
  # docker in Kokoro can accesss the GCP metadata server
  start_time = time.time()
  utils.active_gcloud_service(FLAGS.gcloud_key_file_url, workspace_dir,
                              download_only=not activate_gcloud)
  setup_execution_time['download_token'] = time.time() - start_time

  # Set up the raid array.
  start_time = time.time()
  device_utils.create_drive_from_devices(FLAGS.root_data_dir,
                                         FLAGS.gce_nvme_raid)
  setup_execution_time['create_drive'] = time.time() - start_time

  if FLAGS.dockerfile_path:
    if FLAGS.dockerfile_path.endswith('.tar.gz'):
      logging.info('Assuming given file %s is a docker image to load',
                   FLAGS.dockerfile_path)
      _load_docker_image(FLAGS, workspace_dir, setup_execution_time)
    else:
      _create_docker_image(FLAGS, project_dir, workspace_dir,
                           setup_execution_time)

  logging.info('Setup time in seconds by operation:\n%s',
               json.dumps(setup_execution_time, indent=2))
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/screenshots/profiling_overview.png
deleted binary file (mode 100644 → 0), 364 KB
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/screenshots/profiling_trace_view.png
deleted binary file (mode 100644 → 0), 273 KB
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/scripts/create_big_table.txt
deleted file (mode 100644 → 0)
[
{
"name": "execution_timestamp",
"type": "TIMESTAMP",
"mode": "REQUIRED"
},
{
"name": "execution_id",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "ml_framework_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "benchmark_result",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "benchmark_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "setup_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "system_info",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "process_info",
"type": "STRING",
"mode": "NULLABLE"
}
]
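A hedged sketch (not part of the commit) of turning the schema above into a BigQuery table with the google-cloud-bigquery client; the project, dataset, and table names are placeholders.

import json
from google.cloud import bigquery

client = bigquery.Client(project='my-project')  # placeholder project id
with open('create_big_table.txt') as schema_file:
  fields = json.load(schema_file)

# One SchemaField per entry in the JSON schema (name, type, mode).
schema = [bigquery.SchemaField(field['name'], field['type'], mode=field['mode'])
          for field in fields]
table = bigquery.Table('my-project.perfzero.benchmark_results', schema=schema)  # placeholder table id
client.create_table(table)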
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/scripts/generate-readme-header.sh
deleted file (mode 100644 → 0)
#!/usr/bin/env bash
#
# Steps:
#
# 1. Download corresponding html file for some README.md:
# curl -s $1
#
# 2. Discard rows where no substring 'user-content-' (github's markup):
# awk '/user-content-/ { ...
#
# 3.1 Get last number in each row like ' ... </span></a>sitemap.js</h1'.
# It's a level of the current header:
# substr($0, length($0), 1)
#
# 3.2 Get level from 3.1 and insert corresponding number of spaces before '*':
# sprintf("%*s", substr($0, length($0), 1)*3, " ")
#
# 4. Find head's text and insert it inside "* [ ... ]":
# substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
#
# 5. Find anchor and insert it inside "(...)":
# substr($0, match($0, "href=\"[^\"]+?\" ")+6, RLENGTH-8)
#
gh_toc_version
=
"0.6.0"
gh_user_agent
=
"gh-md-toc v
$gh_toc_version
"
#
# Download rendered into html README.md by its url.
#
#
gh_toc_load
()
{
local
gh_url
=
$1
if
type
curl &>/dev/null
;
then
curl
--user-agent
"
$gh_user_agent
"
-s
"
$gh_url
"
elif
type
wget &>/dev/null
;
then
wget
--user-agent
=
"
$gh_user_agent
"
-qO-
"
$gh_url
"
else
echo
"Please, install 'curl' or 'wget' and try again."
exit
1
fi
}
#
# Converts local md file into html by GitHub
#
# ➥ curl -X POST --data '{"text": "Hello world github/linguist#1 **cool**, and #1!"}' https://api.github.com/markdown
# <p>Hello world github/linguist#1 <strong>cool</strong>, and #1!</p>'"
gh_toc_md2html
()
{
local
gh_file_md
=
$1
URL
=
https://api.github.com/markdown/raw
if
[
-z
"
$GH_TOC_TOKEN
"
]
;
then
TOKEN
=
$GH_TOC_TOKEN
else
TOKEN
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
/token.txt"
fi
if
[
-f
"
$TOKEN
"
]
;
then
URL
=
"
$URL
?access_token=
$(
cat
$TOKEN
)
"
fi
# echo $URL 1>&2
OUTPUT
=
"
$(
curl
-s
--user-agent
"
$gh_user_agent
"
\
--data-binary
@
"
$gh_file_md
"
-H
"Content-Type:text/plain"
\
$URL
)
"
if
[
"
$?
"
!=
"0"
]
;
then
echo
"XXNetworkErrorXX"
fi
if
[
"
$(
echo
"
${
OUTPUT
}
"
|
awk
'/API rate limit exceeded/'
)
"
!=
""
]
;
then
echo
"XXRateLimitXX"
else
echo
"
${
OUTPUT
}
"
fi
}
#
# Is passed string url
#
gh_is_url
()
{
case
$1
in
https
*
|
http
*
)
echo
"yes"
;;
*
)
echo
"no"
;;
esac
}
#
# TOC generator
#
gh_toc
(){
local
gh_src
=
$1
local
gh_src_copy
=
$1
local
gh_ttl_docs
=
$2
local
need_replace
=
$3
if
[
"
$gh_src
"
=
""
]
;
then
echo
"Please, enter URL or local path for a README.md"
exit
1
fi
# Show "TOC" string only if working with one document
if
[
"
$gh_ttl_docs
"
=
"1"
]
;
then
echo
"Table of Contents"
echo
"================="
echo
""
gh_src_copy
=
""
fi
if
[
"
$(
gh_is_url
"
$gh_src
"
)
"
==
"yes"
]
;
then
gh_toc_load
"
$gh_src
"
| gh_toc_grab
"
$gh_src_copy
"
if
[
"
${
PIPESTATUS
[0]
}
"
!=
"0"
]
;
then
echo
"Could not load remote document."
echo
"Please check your url or network connectivity"
exit
1
fi
if
[
"
$need_replace
"
=
"yes"
]
;
then
echo
echo
"!! '
$gh_src
' is not a local file"
echo
"!! Can't insert the TOC into it."
echo
fi
else
local
rawhtml
=
$(
gh_toc_md2html
"
$gh_src
"
)
if
[
"
$rawhtml
"
==
"XXNetworkErrorXX"
]
;
then
echo
"Parsing local markdown file requires access to github API"
echo
"Please make sure curl is installed and check your network connectivity"
exit
1
fi
if
[
"
$rawhtml
"
==
"XXRateLimitXX"
]
;
then
echo
"Parsing local markdown file requires access to github API"
echo
"Error: You exceeded the hourly limit. See: https://developer.github.com/v3/#rate-limiting"
TOKEN
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
/token.txt"
echo
"or place github auth token here:
$TOKEN
"
exit
1
fi
local
toc
=
`
echo
"
$rawhtml
"
| gh_toc_grab
"
$gh_src_copy
"
`
echo
"
$toc
"
if
[
"
$need_replace
"
=
"yes"
]
;
then
local
ts
=
"<
\!
--ts-->"
local
te
=
"<
\!
--te-->"
local
dt
=
`
date
+
'%F_%H%M%S'
`
local
ext
=
".orig.
${
dt
}
"
local
toc_path
=
"
${
gh_src
}
.toc.
${
dt
}
"
local
toc_footer
=
"<!-- Added by:
`
whoami
`
, at:
`
date
--iso-8601
=
'minutes'
`
-->"
# http://fahdshariff.blogspot.ru/2012/12/sed-mutli-line-replacement-between-two.html
# clear old TOC
sed
-i
${
ext
}
"/
${
ts
}
/,/
${
te
}
/{//!d;}"
"
$gh_src
"
# create toc file
echo
"
${
toc
}
"
>
"
${
toc_path
}
"
echo
-e
"
\n
${
toc_footer
}
\n
"
>>
"
$toc_path
"
# insert toc file
if
[[
"
`
uname
`
"
==
"Darwin"
]]
;
then
sed
-i
""
"/
${
ts
}
/r
${
toc_path
}
"
"
$gh_src
"
else
sed
-i
"/
${
ts
}
/r
${
toc_path
}
"
"
$gh_src
"
fi
echo
echo
"!! TOC was added into: '
$gh_src
'"
echo
"!! Origin version of the file: '
${
gh_src
}${
ext
}
'"
echo
"!! TOC added into a separate file: '
${
toc_path
}
'"
echo
fi
fi
}
#
# Grabber of the TOC from rendered html
#
# $1 — a source url of document.
# It's need if TOC is generated for multiple documents.
#
gh_toc_grab
()
{
# if closed <h[1-6]> is on the new line, then move it on the prev line
# for example:
# was: The command <code>foo1</code>
# </h1>
# became: The command <code>foo1</code></h1>
sed
-e
':a'
-e
'N'
-e
'$!ba'
-e
's/\n<\/h/<\/h/g'
|
# find strings that corresponds to template
grep
-E
-o
'<a.*id="user-content-[^"]*".*</h[1-6]'
|
# remove code tags
sed
's/<code>//g'
|
sed
's/<\/code>//g'
|
# now all rows are like:
# <a id="user-content-..." href="..."><span ...></span></a> ... </h1
# format result line
# * $0 — whole string
# * last element of each row: "</hN" where N in (1,2,3,...)
echo
-e
"
$(
awk
-v
"gh_url=
$1
"
'{
level = substr($0, length($0), 1)
text = substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
href = substr($0, match($0, "href=\"[^\"]+?\"")+6, RLENGTH-7)
print sprintf("%*s", level*3, " ") "* [" text "](" gh_url href ")" }'
|
sed
'y/+/ /; s/%/\\x/g'
)
"
}
#
# Returns filename only from full path or url
#
gh_toc_get_filename
()
{
echo
"
${
1
##*/
}
"
}
#
# Options hendlers
#
gh_toc_app
()
{
local
app_name
=
$(
basename
$0
)
local
need_replace
=
"no"
if
[
"
$1
"
=
'--help'
]
||
[
$#
-eq
0
]
;
then
echo
"GitHub TOC generator (
$app_name
):
$gh_toc_version
"
echo
""
echo
"Usage:"
echo
"
$app_name
[--insert] src [src] Create TOC for a README file (url or local path)"
echo
"
$app_name
- Create TOC for markdown from STDIN"
echo
"
$app_name
--help Show help"
echo
"
$app_name
--version Show version"
return
fi
if
[
"
$1
"
=
'--version'
]
;
then
echo
"
$gh_toc_version
"
echo
echo
"os:
`
lsb_release
-d
|
cut
-f
2
`
"
echo
"kernel:
`
cat
/proc/version
`
"
echo
"shell:
`
$SHELL
--version
`
"
echo
for
tool
in
curl wget
grep awk sed
;
do
printf
"%-5s: "
$tool
echo
`
$tool
--version
|
head
-n
1
`
done
return
fi
if
[
"
$1
"
=
"-"
]
;
then
if
[
-z
"
$TMPDIR
"
]
;
then
TMPDIR
=
"/tmp"
elif
[
-n
"
$TMPDIR
"
-a
!
-d
"
$TMPDIR
"
]
;
then
mkdir
-p
"
$TMPDIR
"
fi
local
gh_tmp_md
gh_tmp_md
=
$(
mktemp
$TMPDIR
/tmp.XXXXXX
)
while
read
input
;
do
echo
"
$input
"
>>
"
$gh_tmp_md
"
done
gh_toc_md2html
"
$gh_tmp_md
"
| gh_toc_grab
""
return
fi
if
[
"
$1
"
=
'--insert'
]
;
then
need_replace
=
"yes"
shift
fi
for
md
in
"
$@
"
do
echo
""
gh_toc
"
$md
"
"$#"
"
$need_replace
"
done
echo
""
echo
"Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)"
}
#
# Entry point
#
gh_toc_app
"
$@
"
TensorFlow2x/ComputeVision/Classification/benchmarks-master/perfzero/scripts/plot_process_info.py deleted 100644 → 0
#!/usr/bin/python
#
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Plot graph showing process metric values over time."""
from __future__ import print_function

import argparse
import json
import sys

import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as backend_pdf
import matplotlib.ticker as tick

colors = ['b', 'r', 'g', 'c', 'pink']


def visualize(file_path):
  entries = []
  with open(file_path) as f:
    entries = [json.loads(line) for line in f.readlines() if line.strip()]
  if not entries:
    print('There is no data in file {}'.format(file_path))
    return

  pdf = backend_pdf.PdfPages("process_info.pdf")
  idx = 0
  names = [name for name in entries[0].keys() if name != 'time']
  times = [entry['time'] for entry in entries]
  for name in names:
    values = [entry[name] for entry in entries]
    fig = plt.figure()
    ax = plt.gca()
    ax.yaxis.set_major_formatter(tick.ScalarFormatter(useMathText=True))
    plt.ticklabel_format(style='sci', axis='y', scilimits=(-2, 3))
    plt.plot(times, values, colors[idx % len(colors)], marker='x', label=name)
    plt.xlabel('Time (sec)')
    plt.ylabel(name)
    plt.ylim(ymin=0)
    plt.legend(loc='upper left')
    pdf.savefig(fig)
    idx += 1
  plt.show()
  pdf.close()
  print('Generated process_info.pdf from {}'.format(file_path))


if __name__ == '__main__':
  parser = argparse.ArgumentParser(usage='plot_process_info.py <path_to_file>')
  parser.add_argument('file_path', type=str)
  flags = parser.parse_args(sys.argv[1:])
  visualize(flags.file_path)
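plot_process_info.py expects a JSON-lines file in which every record carries a numeric 'time' field plus one or more metric fields. A quick way to exercise it is to synthesize such a file; in this sketch the metric names (cpu_percent, rss_gb) are purely illustrative assumptions, since the script only requires the 'time' key.

```python
# Hypothetical generator for a plot_process_info.py input file.
# Only the 'time' key is required by the plotting script above;
# the other field names are made up for illustration.
import json
import random

with open('process_info.log', 'w') as f:
    for t in range(0, 60, 5):
        record = {
            'time': t,                              # seconds since start
            'cpu_percent': random.uniform(20, 90),  # illustrative metric
            'rss_gb': random.uniform(1.0, 4.0),     # illustrative metric
        }
        f.write(json.dumps(record) + '\n')

# Then: python plot_process_info.py process_info.log
```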
TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/README.md deleted 100644 → 0
# tf_cnn_benchmarks: High performance benchmarks
**Note: tf_cnn_benchmarks is no longer maintained.**
tf_cnn_benchmarks contains TensorFlow 1 implementations of several popular
convolutional models, and is designed to be as fast as possible.
tf_cnn_benchmarks supports both running on a single machine or running in
distributed mode across multiple hosts.
tf_cnn_benchmarks is no longer maintained. Although it will run with TensorFlow
2, it was written and optimized for TensorFlow 1, and has not been maintained
since TensorFlow 2 was released. For clean and easy-to-read TensorFlow 2 models,
please see the
[TensorFlow Official Models](https://github.com/tensorflow/models/tree/master/official).
## Getting Started
To run ResNet50 with synthetic data without distortions with a single GPU, run
```
python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
```
Note that the master branch of tf_cnn_benchmarks occasionally requires the
latest nightly version of TensorFlow. You can install the nightly version by
running
`pip install tf-nightly-gpu`
in a clean environment, or by installing
TensorFlow from source. We sometimes will create a branch of tf_cnn_benchmarks,
in the form of cnn_tf_vX.Y_compatible, that is compatible with TensorFlow
version X.Y. For example, branch
[cnn_tf_v1.9_compatible](https://github.com/tensorflow/benchmarks/tree/cnn_tf_v1.9_compatible/scripts/tf_cnn_benchmarks)
works with TensorFlow 1.9. However, as tf_cnn_benchmarks is no longer
maintained, we will likely no longer create new branches.
Some important flags are:

*   model: Model to use, e.g. resnet50, inception3, vgg16, and alexnet.
*   num_gpus: Number of GPUs to use.
*   data_dir: Path to data to process. If not set, synthetic data is used. To
    use ImageNet data, use these
    [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started)
    as a starting point.
*   batch_size: Batch size for each GPU.
*   variable_update: The method for managing variables: parameter_server,
    replicated, distributed_replicated, independent.
*   local_parameter_device: Device to use as parameter server: cpu or gpu.
To see the full list of flags, run
`python tf_cnn_benchmarks.py --help`
.
To run ResNet50 with real data with 8 GPUs, run:
```
python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=256 \
--model=resnet50 --optimizer=momentum --variable_update=replicated \
--nodistortions --gradient_repacking=8 --num_gpus=8 \
--num_epochs=90 --weight_decay=1e-4 --data_dir=${DATA_DIR} --use_fp16 \
--train_dir=${CKPT_DIR}
```
This will train a ResNet-50 model on ImageNet with 2048 batch size on 8
GPUs. The model should train to around 76% accuracy.
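As a rough sanity check on those numbers, the global batch is the per-GPU batch times the GPU count (256 × 8 = 2048), and throughput follows from the per-step time the benchmark prints. A small sketch of that arithmetic; the step time below is a placeholder you would read from the actual output:

```python
# Back-of-the-envelope throughput estimate for the 8-GPU ResNet-50 run above.
per_gpu_batch = 256
num_gpus = 8
global_batch = per_gpu_batch * num_gpus        # 2048, as stated above
seconds_per_step = 0.9                         # placeholder; read from benchmark output
images_per_sec = global_batch / seconds_per_step
steps_per_epoch = 1281167 // global_batch      # ImageNet-1k training-set size
print(global_batch, round(images_per_sec), steps_per_epoch)
```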
## Running the tests
To run the tests, run
```bash
pip install portpicker
python run_tests.py && python run_tests.py --run_distributed_tests
```
Note the tests require portpicker.
The command above runs a subset of tests that is both fast and fairly
comprehensive. Alternatively, all the tests can be run, but this will take a
long time:
```bash
python run_tests.py --full_tests && python run_tests.py --full_tests --run_distributed_tests
```
We will run all tests on every PR before merging them, so it is not necessary
to pass
`--full_tests`
when running tests yourself.
To run an individual test, such as method
`testParameterServer`
of test class
`TfCnnBenchmarksTest`
of module
`benchmark_cnn_test`
, run
```bash
python -m unittest -v benchmark_cnn_test.TfCnnBenchmarksTest.testParameterServer
```
TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/all_reduce_benchmark.py deleted 100644 → 0
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks.

tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is
useful for benchmarking the performance of just this gradient aggregation,
instead of the entire model. All the flags that tf_cnn_benchmarks accepts are
also accepted by this script, although many are silently ignored.

The number and shapes of the tensors all-reduced are those of the variables of
the model specified by the --model flag.

TODO(reedwm): Allow custom sizes to be specified.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time

from absl import app
from absl import flags as absl_flags
import tensorflow.compat.v1 as tf

from tensorflow.python.ops import control_flow_ops
import benchmark_cnn
import cnn_util
import flags
from cnn_util import log_fn

absl_flags.DEFINE_integer('iters_per_step', 5,
                          'Number of iterations to run all-reduce for, per '
                          'step. Every step, a session will be run on a Graph '
                          'that contains this many copies of the all-reduce. '
                          'The copies are run sequentially. Setting this above '
                          '1 is useful to lower the overhead of starting the '
                          'session run, running the VariableV2 ops at the '
                          'start of the step, etc.')

flags.define_flags()
for name in flags.param_specs.keys():
  absl_flags.declare_key_flag(name)


def get_var_shapes(model):
  """Returns the list of variable shapes for a tf_cnn_benchmarks Model."""
  with tf.Graph().as_default():
    # The variable shapes do not depend on the batch size.
    images = tf.placeholder(tf.float32, model.get_input_shapes('train')[0])
    model.build_network([images])
    return [[int(d) for d in v.shape.dims] for v in tf.trainable_variables()]


def all_reduce(all_device_tensors, variable_mgr):
  """Performs a single batch all-reduce.

  Args:
    all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
      a tensor, where t is the tower the tensor is on and i is the index of
      the tensor.
    variable_mgr: The VariableMgr to perform the all-reduce.
  Returns:
    List of list of tensors in the same form as `all_device_tensors`, except
    the tensors are aggregated across towers.
  """
  tower_grads = [[(g, None) for g in device_tensors]
                 for device_tensors in all_device_tensors]
  _, aggregated_tower_grads = variable_mgr.preprocess_device_grads(tower_grads)
  return [
      [g for g, _ in agg_device_tensors]
      for agg_device_tensors in aggregated_tower_grads
  ]


def build_all_reduce_iterations(all_device_tensors, tower_devices,
                                variable_mgr, num_iters):
  """Builds the all-reduce ops for multiple iterations to aggregate tensors.

  The tensors in `all_device_tensors` are aggregated `num_iters` times. Each
  iteration aggregates the results from the previous iteration. The iterations
  are run sequentially, so the aggregations for an iteration do not start
  running until the previous iteration has completed. Each iteration after the
  first is aggregating already-aggregated values, but it does not matter
  because we are only aggregating for benchmarking purposes.

  Args:
    all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
      a tensor, where t is the tower the tensor is on and i is the index of
      the tensor.
    tower_devices: A list of device strings. tower_devices[t] is the device
      of the tensors in all_device_tensors[t].
    variable_mgr: The VariableMgr to perform the all-reduce.
    num_iters: Number of iterations to aggregate tensors for.
  Returns:
    An op that when run, causes the all-reduce ops to run.
  """
  for i in range(num_iters):
    with tf.name_scope('iteration_%d' % i):
      # Step 1: Do the aggregation.
      with tf.name_scope('tensor_aggregation'):
        all_device_tensors = all_reduce(all_device_tensors, variable_mgr)

      # Step 2. Create identity ops, to bring the aggregated results back to
      # each device.
      new_all_device_tensors = []
      for device, device_tensors in zip(tower_devices, all_device_tensors):
        with tf.device(device):
          new_all_device_tensors.append([
              tf.identity(t, name='identity_after_allreduce')
              for t in device_tensors
          ])
      all_device_tensors = new_all_device_tensors

      # Step 3. Add control dependencies to delay the next iteration until this
      # iteration is complete. To avoid extra overhead, we do not have any
      # cross-device control dependencies, which means it's possible for two
      # iterations to slightly overlap.
      new_all_device_tensors = []
      for device_tensors in all_device_tensors:
        new_all_device_tensors.append([
            control_flow_ops.with_dependencies(
                device_tensors, t, name='identity_after_dependencies')
            for t in device_tensors
        ])
      all_device_tensors = new_all_device_tensors

  # To prevent the dependency optimizer from removing every op we created,
  # we store the results in variables.
  ops_to_run = []
  for device, device_tensors in zip(tower_devices, all_device_tensors):
    with tf.device(device):
      for t in device_tensors:
        # The placeholder initial value is never run.
        var = tf.Variable(tf.placeholder(tf.float32, t.shape), collections=[])
        ops_to_run.append(var.assign(t))
  return tf.group(*ops_to_run)


def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters):
  """Builds the graph for the benchmark.

  Args:
    tower_devices: A list of device strings of the devices to run the
      all-reduce benchmark on.
    tensor_shapes: A list of shapes of the tensors that will be aggregated for
      the all-reduce.
    variable_mgr: The VariableMgr to perform the all-reduce.
    num_iters: Number of iterations to aggregate tensors for.
  Returns:
    An op that runs the benchmark.
  """
  all_device_tensors = []
  for i, tower_device in enumerate(tower_devices):
    with tf.device(tower_device):
      device_tensors = []
      for j, shape in enumerate(tensor_shapes):
        tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32),
                             name='tensor_%d_on_device_%d' % (j, i))
        device_tensors.append(tensor)
    all_device_tensors.append(device_tensors)

  log_fn('Building all-reduce ops')
  benchmark_op = build_all_reduce_iterations(all_device_tensors, tower_devices,
                                             variable_mgr, num_iters)
  log_fn('Done building all-reduce ops')
  return benchmark_op


def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op):
  """Runs the graph for the benchmark.

  Args:
    benchmark_op: An op that runs the benchmark.
    bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
    init_ops: A list of ops that are run before `benchmark_op` for
      initialization.
    dummy_loss_op: Any op. We must pass a loss op to
      `benchmark_cnn.benchmark_one_step`, but the result of the op is never
      actually used.
  """
  config = benchmark_cnn.create_config_proto(bench_cnn.params)
  with tf.Session(config=config) as sess:
    for op in init_ops:
      sess.run(op)
    step_train_times = []
    fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op}
    log_fn('Running warmup')
    for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches):
      if i == 0:
        log_fn('Running all-reduce ops')
        start = time.perf_counter()
      if i > 0 and i % bench_cnn.params.display_every == 0:
        log_fn('Iteration: %d. Average time per step so far: %s' %
               (i, (time.perf_counter() - start) / i))
      # Call benchmark_one_step instead of directly calling sess.run(...), to
      # potentially get a trace file, partitioned graphs, etc.
      benchmark_cnn.benchmark_one_step(
          sess=sess,
          fetches=fetches,
          step=i,
          # The batch size is only used for the images/sec calculation, which
          # is not actually calculated because we pass show_images_per_sec=False.
          batch_size=None,
          step_train_times=step_train_times,
          trace_filename=bench_cnn.trace_filename,
          partitioned_graph_file_prefix=(
              bench_cnn.params.partitioned_graph_file_prefix),
          profiler=None,
          image_producer=None,
          params=bench_cnn.params,
          show_images_per_sec=False)
    log_fn('Average time per step: %s' %
           ((time.perf_counter() - start) / bench_cnn.num_batches))


def run_benchmark(bench_cnn, num_iters):
  """Runs the all-reduce benchmark.

  Args:
    bench_cnn: The BenchmarkCNN where params, the variable manager, and other
      attributes are obtained.
    num_iters: Number of iterations to do all-reduce for.

  Raises:
    ValueError: Invalid params of bench_cnn.
  """
  if bench_cnn.params.variable_update != 'replicated':
    raise ValueError('--variable_update=replicated must be specified to use '
                     'the all-reduce benchmark')
  if bench_cnn.params.variable_consistency == 'relaxed':
    raise ValueError('--variable_consistency=relaxed is not supported')
  benchmark_op = build_graph(bench_cnn.raw_devices,
                             get_var_shapes(bench_cnn.model),
                             bench_cnn.variable_mgr, num_iters)
  init_ops = [
      tf.global_variables_initializer(),
      bench_cnn.variable_mgr.get_post_init_ops()
  ]
  loss_op = tf.no_op()

  if bench_cnn.graph_file:
    path, filename = os.path.split(bench_cnn.graph_file)
    as_text = filename.endswith('txt')
    log_fn('Writing GraphDef as %s to %s' % ('text' if as_text else 'binary',
                                             bench_cnn.graph_file))
    tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
                         path, filename, as_text)

  run_graph(benchmark_op, bench_cnn, init_ops, loss_op)


# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks
def main(positional_arguments):
  # Command-line arguments like '--distortions False' are equivalent to
  # '--distortions=True False', where False is a positional argument. To
  # prevent this from silently running with distortions, we do not allow
  # positional arguments.
  assert len(positional_arguments) >= 1
  if len(positional_arguments) > 1:
    raise ValueError('Received unknown positional arguments: %s' %
                     positional_arguments[1:])

  params = benchmark_cnn.make_params_from_flags()
  params = benchmark_cnn.setup(params)
  bench = benchmark_cnn.BenchmarkCNN(params)

  tfversion = cnn_util.tensorflow_version_tuple()
  log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))

  run_benchmark(bench, absl_flags.FLAGS.iters_per_step)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  app.run(main)  # Raises error on invalid flags, unlike tf.app.run()
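The measurement pattern in run_graph above (run warmup iterations, start the clock at step 0, then report the average per-step time) does not depend on TensorFlow. A minimal standalone sketch of the same loop, with a stand-in workload in place of the session run, is:

```python
# Minimal sketch of the warmup-then-average timing loop used by run_graph.
# The workload is a stand-in; in the benchmark it is a session.run of the
# chained all-reduce ops.
import time

def benchmark(step_fn, num_warmup_batches, num_batches):
    start = None
    for i in range(-num_warmup_batches, num_batches):
        if i == 0:
            start = time.perf_counter()   # clock starts only after warmup
        step_fn()
    return (time.perf_counter() - start) / num_batches

avg = benchmark(lambda: sum(x * x for x in range(100000)),
                num_warmup_batches=5, num_batches=20)
print('Average time per step: %s' % avg)
```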
TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/all_reduce_benchmark_test.py deleted 100644 → 0
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for all_reduce_benchmark.py."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow.compat.v1 as tf

import all_reduce_benchmark
import benchmark_cnn
import test_util


class AllReduceBenchmarkTest(tf.test.TestCase):
  """Tests the all-reduce benchmark."""

  def _test_run_benchmark(self, params):
    """Tests that run_benchmark() runs successfully with the params."""
    logs = []
    with test_util.monkey_patch(all_reduce_benchmark,
                                log_fn=test_util.print_and_add_to_list(logs)):
      bench_cnn = benchmark_cnn.BenchmarkCNN(params)
      all_reduce_benchmark.run_benchmark(bench_cnn, num_iters=5)
      self.assertRegex(logs[-1], '^Average time per step: [0-9.]+$')

  def test_run_benchmark(self):
    """Tests that run_benchmark() runs successfully."""
    params = benchmark_cnn.make_params(num_batches=10,
                                       variable_update='replicated',
                                       num_gpus=2)
    self._test_run_benchmark(params)
    params = params._replace(hierarchical_copy=True, gradient_repacking=8,
                             num_gpus=8)
    self._test_run_benchmark(params)


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/allreduce.py deleted 100644 → 0
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for allreduce."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
as
pycoll
import
re
from
six.moves
import
xrange
# pylint: disable=redefined-builtin
import
tensorflow.compat.v1
as
tf
# pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
try
:
from
tensorflow.python.distribute.v1
import
all_reduce
except
ImportError
:
# Compatibility with TF 2.4 and below
from
tensorflow.python.distribute
import
all_reduce
from
tensorflow.python.framework
import
device
as
pydev
from
tensorflow.python.framework
import
ops
from
tensorflow.python.ops
import
collective_ops
AllReduceSpecTuple
=
pycoll
.
namedtuple
(
'AllReduceSpecTuple'
,
'alg shards limit'
)
def
parse_general_int
(
s
):
"""Parse integer with power-of-2 suffix eg. 32k."""
mo
=
re
.
match
(
r
'(\d+)([KkMGT]?)$'
,
s
)
if
mo
:
i
,
suffix
=
mo
.
group
(
1
,
2
)
v
=
int
(
i
)
if
suffix
:
if
suffix
==
'K'
or
suffix
==
'k'
:
v
*=
1024
elif
suffix
==
'M'
:
v
*=
(
1024
*
1024
)
elif
suffix
==
'G'
:
v
*=
(
1024
*
1024
*
1024
)
elif
suffix
==
'T'
:
v
*=
(
1024
*
1024
*
1024
*
1024
)
else
:
raise
ValueError
(
'invalid integer string %s'
%
s
)
return
v
else
:
v
=
int
(
s
)
return
v
def
parse_all_reduce_spec
(
all_reduce_spec
):
"""Parse all_reduce_spec.
Args:
all_reduce_spec: a string specifying a combination of all-reduce
algorithms to apply for gradient reduction.
Returns:
a list of AllReduceSpecTuple.
Raises:
ValueError: all_reduce_spec is not well-formed.
An all_reduce_spec has BNF form:
int ::= positive whole number
g_int ::= int[KkMGT]?
alg_spec ::= alg | alg#int
range_spec ::= alg_spec | alg_spec/alg_spec
spec ::= range_spec | range_spec:g_int:range_spec
Not all syntactically correct specifications are supported.
Examples of supported all_reduce_spec strings, with semantics explained:
'collective' == apply tf.collective_reduce operator to all tensors.
'collective#2' == apply tf.collective_reduce operator to all tensors,
requesting up to 2 simultaneous transfers at each node, if
feasible, by subdividing tensor by an additional factor of 2.
'xring' == apply ring all-reduce to all tensors
'xring#2' == apply ring all-reduce to all tensors, using two simultaneous
transfer rings, each operating on 1/2 of each tensor.
'nccl' == apply NCCL all-reduce to all tensors (only works within
a single worker process where all devices are GPUs)
'nccl/xring' == apply NCCL all-reduce to all tensors within each worker
to produce at least one full-reduced (locally) value,
then apply ring all-reduce to one such value from each
worker, then apply NCCL broadcast to propagate those globally
reduced values back to every device within each worker.
'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each
distributed tensor is reduced by copying all instances to
one of the worker CPUs, computing the reduction there, then
copying back to each participating device. Tensor reductions
are assigned to specific CPUs round-robin.
'psgpu#4' == Arrange all GPUs across all workers into groups of 4.
Each distributed tensor is shuffle reduced against one
such group of 4 GPUs, selected round-robin. That is, each
tensor is split across 4 shards for the reduction.
'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to
tensors of size <= 2048 elements, apply 2-shard pscpu to
tensors up to size 64k elements, apply xring to larger tensors.
'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on
the worker's CPU, then use 2-shard shuffle to reduce those
locally reduced tensors across workers (on the worker CPUs), then
scatter the globally reduced values locally from each worker CPU.
"""
range_parts
=
all_reduce_spec
.
split
(
':'
)
+
[
'-1'
]
if
len
(
range_parts
)
%
2
:
raise
ValueError
(
'all_reduce_spec not well formed: %s'
%
all_reduce_spec
)
limit
=
0
spec
=
[]
alg
=
None
shards
=
1
for
i
,
range_part
in
enumerate
(
range_parts
):
if
i
%
2
==
1
:
try
:
limit
=
parse_general_int
(
range_part
)
spec
.
append
(
AllReduceSpecTuple
(
alg
=
alg
,
shards
=
shards
,
limit
=
limit
))
except
ValueError
:
raise
ValueError
(
'all_reduce_spec (%s) contains non-integer range %s'
%
(
all_reduce_spec
,
range_part
))
else
:
alg
=
range_part
alg_parts
=
range_part
.
split
(
'#'
)
alg
=
alg_parts
[
0
]
if
len
(
alg_parts
)
>
1
:
try
:
shards
=
int
(
alg_parts
[
1
])
except
ValueError
:
raise
ValueError
(
'all_reduce_spec (%s) contains non-integer '
'shards %s'
%
all_reduce_spec
,
alg_parts
[
1
])
else
:
shards
=
1
if
alg
not
in
[
'nccl'
,
'nccl/xring'
,
'nccl/rechd'
,
'nccl/pscpu'
,
'xring'
,
'pscpu'
,
'psgpu'
,
'pscpu/pscpu'
,
'collective'
]:
raise
ValueError
(
'all_reduce_spec (%s) contains invalid alg %s'
%
(
all_reduce_spec
,
alg
))
return
spec
def
build_all_reduce_device_prefixes
(
job_name
,
num_tasks
):
"""Build list of device prefix names for all_reduce.
Args:
job_name: 'worker', 'ps' or 'localhost'.
num_tasks: number of jobs across which device names should be generated.
Returns:
A list of device name prefix strings. Each element spells out the full
host name without adding the device.
e.g. '/job:worker/task:0'
"""
if
job_name
!=
'localhost'
:
return
[
'/job:%s/task:%d'
%
(
job_name
,
d
)
for
d
in
range
(
0
,
num_tasks
)]
else
:
assert
num_tasks
==
1
return
[
'/job:%s'
%
job_name
]
def
group_device_names
(
devices
,
group_size
):
"""Group device names into groups of group_size.
Args:
devices: list of strings naming devices.
group_size: int >= 1
Returns:
list of lists of devices, where each inner list is group_size long,
and each device appears at least once in an inner list. If
len(devices) % group_size = 0 then each device will appear
exactly once.
Raises:
ValueError: group_size > len(devices)
"""
num_devices
=
len
(
devices
)
if
group_size
>
num_devices
:
raise
ValueError
(
'only %d devices, but group_size=%d'
%
(
num_devices
,
group_size
))
num_groups
=
(
num_devices
//
group_size
+
(
1
if
(
num_devices
%
group_size
!=
0
)
else
0
))
groups
=
[[]
for
i
in
range
(
num_groups
)]
for
i
in
range
(
0
,
num_groups
*
group_size
):
groups
[
i
%
num_groups
].
append
(
devices
[
i
%
num_devices
])
return
groups
def
split_grads_by_size
(
threshold_size
,
device_grads
):
"""Break gradients into two sets according to tensor size.
Args:
threshold_size: int size cutoff for small vs large tensor.
device_grads: List of lists of (gradient, variable) tuples. The outer
list is over devices. The inner list is over individual gradients.
Returns:
small_grads: Subset of device_grads where shape is <= theshold_size
elements.
large_grads: Subset of device_grads where shape is > threshold_size
elements.
"""
small_grads
=
[]
large_grads
=
[]
for
dl
in
device_grads
:
small_dl
=
[]
large_dl
=
[]
for
(
g
,
v
)
in
dl
:
tensor_size
=
g
.
get_shape
().
num_elements
()
if
tensor_size
<=
threshold_size
:
small_dl
.
append
([
g
,
v
])
else
:
large_dl
.
append
([
g
,
v
])
if
small_dl
:
small_grads
.
append
(
small_dl
)
if
large_dl
:
large_grads
.
append
(
large_dl
)
return
small_grads
,
large_grads
_instance_key
=
1
def
new_collective_instance_key
():
"""Returns a new instance key for use in defining a collective op."""
global
_instance_key
v
=
_instance_key
_instance_key
+=
1
return
v
_group_key
=
1
_group_key_table
=
dict
()
def
collective_group_key
(
devices
):
"""Returns a group key for the set of devices.
Args:
devices: list of strings naming devices in a collective group.
Returns:
int key uniquely identifying the set of device names.
"""
global
_group_key
global
_group_key_table
parsed
=
[
pydev
.
DeviceSpec
.
from_string
(
d
)
for
d
in
devices
]
names
=
sorted
([
'%s:%d'
%
(
d
.
device_type
,
d
.
device_index
)
for
d
in
parsed
])
concat
=
','
.
join
(
names
)
if
concat
not
in
_group_key_table
.
keys
():
new_key
=
_group_key
_group_key
+=
1
_group_key_table
[
concat
]
=
new_key
rv
=
_group_key_table
[
concat
]
return
rv
def
build_collective_reduce
(
input_tensors
,
num_workers
,
num_shards
,
red_op
=
'Add'
,
un_op
=
'Id'
):
"""Build a subgraph that does one full all-reduce, using the collective Op.
Args:
input_tensors: tensors within a single worker graph that are to be reduced
together; must be one per device.
num_workers: total number of workers with identical independent graphs that
will be doing this same reduction. The reduction will actually include
the corresponding tensors at all these workers.
num_shards: number of shards into which to divide each per-tick chunk,
normally 1 but could be higher on multi-data-path architectures.
red_op: string naming the reduction op
un_op: string naming the unary final op
Returns:
An array of final tensors, one per device, computed by the full reduction.
Raises:
ValueError: There must be at least two tensors over all the workers.
"""
group_size
=
len
(
input_tensors
)
*
num_workers
if
group_size
<
2
:
raise
ValueError
(
'num_workers * len(input_tensors) must be 2 or greater'
)
devices
=
[
t
.
device
for
t
in
input_tensors
]
num_devices
=
len
(
devices
)
group_key
=
collective_group_key
(
devices
)
instance_key
=
new_collective_instance_key
()
out_tensors
=
[]
if
num_shards
==
1
:
subdiv_offsets
=
[
0
]
elif
num_shards
==
2
:
if
num_devices
>
1
:
subdiv_offsets
=
[
0
,
-
(
num_devices
//
2
)]
else
:
subdiv_offsets
=
[
0
]
else
:
raise
ValueError
(
'Unsupported num_shards %d'
%
num_shards
)
for
d
in
range
(
num_devices
):
with
ops
.
device
(
devices
[
d
]):
reduce_op
=
collective_ops
.
all_reduce
(
input_tensors
[
d
],
group_size
,
group_key
,
instance_key
,
red_op
,
un_op
,
subdiv_offsets
)
out_tensors
.
append
(
reduce_op
)
return
out_tensors
def
broadcast_send
(
t
,
shape
,
dtype
,
group_size
,
group_key
,
instance_key
):
return
collective_ops
.
broadcast_send
(
t
,
shape
,
dtype
,
group_size
,
group_key
,
instance_key
)
def
broadcast_recv
(
shape
,
dtype
,
group_size
,
group_key
,
instance_key
):
return
collective_ops
.
broadcast_recv
(
shape
,
dtype
,
group_size
,
group_key
,
instance_key
)
def
sum_grad_and_var_all_reduce
(
single_session
,
grad_and_vars
,
num_workers
,
alg
,
gpu_indices
,
aux_devices
=
None
,
num_shards
=
1
):
"""Apply all-reduce algorithm over specified gradient tensors."""
scaled_grads
=
[
g
for
g
,
_
in
grad_and_vars
]
if
alg
==
'collective'
:
assert
not
single_session
summed_grads
=
build_collective_reduce
(
scaled_grads
,
num_workers
,
num_shards
,
'Add'
,
'Id'
)
else
:
with
tf
.
name_scope
(
'allreduce'
):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
if
alg
==
'nccl'
:
summed_grads
=
all_reduce
.
build_nccl_all_reduce
(
scaled_grads
,
tf
.
add
)
elif
alg
==
'xring'
:
summed_grads
=
all_reduce
.
build_ring_all_reduce
(
scaled_grads
,
num_workers
,
num_shards
,
gpu_indices
,
tf
.
add
)
elif
alg
==
'nccl/xring'
:
summed_grads
=
all_reduce
.
build_nccl_then_ring
(
scaled_grads
,
num_shards
,
tf
.
add
)
elif
alg
==
'nccl/rechd'
:
summed_grads
=
all_reduce
.
build_nccl_then_recursive_hd
(
scaled_grads
,
tf
.
add
)
elif
alg
==
'nccl/pscpu'
:
summed_grads
=
all_reduce
.
build_nccl_then_shuffle
(
scaled_grads
,
aux_devices
,
tf
.
add
,
tf
.
add_n
)
elif
alg
==
'pscpu/pscpu'
:
summed_grads
=
all_reduce
.
build_shuffle_then_shuffle
(
scaled_grads
,
aux_devices
,
# TODO(tucker): devise a way of better specifying the device set
# for the second level.
[
aux_devices
[
0
]],
tf
.
add_n
)
elif
alg
in
[
'pscpu'
,
'psgpu'
]:
summed_grads
=
all_reduce
.
build_shuffle_all_reduce
(
scaled_grads
,
aux_devices
,
tf
.
add_n
)
else
:
raise
ValueError
(
'unsupported all_reduce alg: '
,
alg
)
result
=
[]
for
(
_
,
v
),
g
in
zip
(
grad_and_vars
,
summed_grads
):
result
.
append
([
g
,
v
])
return
result
def
contains_any
(
haystack
,
needles
):
"""Tests if any needle is a substring of haystack.
Args:
haystack: a string
needles: list of strings
Returns:
True if any element of needles is a substring of haystack,
False otherwise.
"""
for
n
in
needles
:
if
n
in
haystack
:
return
True
return
False
def
sum_gradients_all_reduce
(
single_session
,
dev_prefixes
,
tower_grads
,
num_workers
,
alg
,
num_shards
,
gpu_indices
,
agg_small_grads_max_bytes
=
0
,
agg_small_grads_max_group
=
10
,
allreduce_merge_scope
=
1
):
"""Apply all-reduce algorithm over specified gradient tensors.
Args:
single_session: true if reduction is applied to one graph across
all workers, false if ths application is to a single-worker graph only.
dev_prefixes: list of prefix strings to use to generate PS device names.
tower_grads: the gradients to reduce.
num_workers: number of worker processes across entire job.
alg: the all-reduce algorithm to apply.
num_shards: alg-specific sharding factor.
gpu_indices: indices of local GPUs in order usable for ring-reduce.
agg_small_grads_max_bytes: largest tensor eligible for aggregation,
in number of bytes.
agg_small_grads_max_group: largest permitted aggregation of small
tensors.
allreduce_merge_scope: size of groups into which to partition consecutive
gradients grouped under a common 'allreduce' name scope for application
of ScopedAllocator optimization.
Returns:
list of reduced tensors
"""
alg_contains_shuffle
=
contains_any
(
alg
,
[
'pscpu'
,
'psgpu'
])
is_hierarchical
=
'/'
in
alg
if
'pscpu'
in
alg
:
aux_devices
=
[
prefix
+
'/cpu:0'
for
prefix
in
dev_prefixes
]
elif
'psgpu'
in
alg
:
aux_devices
=
[
prefix
+
'/gpu:%d'
%
i
for
i
in
range
(
len
(
gpu_indices
))
for
prefix
in
dev_prefixes
]
else
:
aux_devices
=
[
'/job:localhost/cpu:0'
]
aux_device_groups
=
group_device_names
(
aux_devices
,
num_shards
if
(
alg
!=
'collective'
and
alg_contains_shuffle
)
else
1
)
group_index
=
0
if
agg_small_grads_max_bytes
>
0
and
agg_small_grads_max_group
>
0
:
tower_grads
,
packing
=
pack_small_tensors
(
tower_grads
,
max_bytes
=
agg_small_grads_max_bytes
,
max_group
=
agg_small_grads_max_group
)
else
:
packing
=
None
reduced_gv_list
=
[]
gv
=
list
(
zip
(
*
tower_grads
))
merge_scope
=
allreduce_merge_scope
if
allreduce_merge_scope
>
0
else
1
chunked_gv
=
[
gv
[
x
:
x
+
merge_scope
]
for
x
in
xrange
(
0
,
len
(
gv
),
merge_scope
)]
for
chunk
in
chunked_gv
:
with
tf
.
name_scope
(
'allreduce'
):
for
grad_and_vars
in
chunk
:
reduced_gv_list
.
append
(
sum_grad_and_var_all_reduce
(
single_session
,
grad_and_vars
,
num_workers
,
alg
,
gpu_indices
,
(
aux_devices
if
is_hierarchical
else
aux_device_groups
[
group_index
]),
num_shards
))
group_index
=
(
group_index
+
1
)
%
len
(
aux_device_groups
)
new_tower_grads
=
[
list
(
x
)
for
x
in
zip
(
*
reduced_gv_list
)]
if
packing
:
new_tower_grads
=
unpack_small_tensors
(
new_tower_grads
,
packing
)
return
new_tower_grads
def
extract_ranges
(
index_list
,
range_size_limit
=
32
):
"""Extract consecutive ranges and singles from index_list.
Args:
index_list: List of monotone increasing non-negative integers.
range_size_limit: Largest size range to return. If a larger
consecutive range exists it will be returned as multiple
ranges.
Returns:
ranges, singles where ranges is a list of [first, last] pairs of
consecutive elements in index_list, and singles is all of the
other elements, in original order.
"""
if
not
index_list
:
return
[],
[]
first
=
index_list
[
0
]
last
=
first
ranges
=
[]
singles
=
[]
for
i
in
index_list
[
1
:]:
if
i
==
last
+
1
and
(
last
-
first
)
<=
range_size_limit
:
last
=
i
else
:
if
last
>
first
:
ranges
.
append
([
first
,
last
])
else
:
singles
.
append
(
first
)
first
=
i
last
=
i
if
last
>
first
:
ranges
.
append
([
first
,
last
])
else
:
singles
.
append
(
first
)
return
ranges
,
singles
GradPackTuple
=
pycoll
.
namedtuple
(
'GradPackTuple'
,
'indices vars shapes'
)
def
pack_range
(
key
,
packing
,
grad_vars
,
rng
):
"""Form the concatenation of a specified range of gradient tensors.
Args:
key: Value under which to store meta-data in packing that will be used
later to restore the grad_var list structure.
packing: Dict holding data describing packed ranges of small tensors.
grad_vars: List of (grad, var) pairs for one tower.
rng: A pair of integers giving the first, last indices of a consecutive
range of tensors to be packed.
Returns:
A tensor that is the concatenation of all the specified small tensors.
"""
to_pack
=
grad_vars
[
rng
[
0
]:
rng
[
1
]
+
1
]
members
=
[]
variables
=
[]
restore_shapes
=
[]
with
tf
.
name_scope
(
'pack'
):
for
g
,
v
in
to_pack
:
variables
.
append
(
v
)
restore_shapes
.
append
(
g
.
shape
)
with
tf
.
device
(
g
.
device
):
members
.
append
(
tf
.
reshape
(
g
,
[
-
1
]))
packing
[
key
]
=
GradPackTuple
(
indices
=
range
(
rng
[
0
],
rng
[
1
]
+
1
),
vars
=
variables
,
shapes
=
restore_shapes
)
with
tf
.
device
(
members
[
0
].
device
):
return
tf
.
concat
(
members
,
0
)
def
unpack_grad_tuple
(
gv
,
gpt
):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, maybe following subsequent operations like
reduction.
"""
elt_widths
=
[
x
.
num_elements
()
for
x
in
gpt
.
shapes
]
with
tf
.
device
(
gv
[
0
][
0
].
device
):
with
tf
.
name_scope
(
'unpack'
):
splits
=
tf
.
split
(
gv
[
0
],
elt_widths
)
unpacked_gv
=
[]
for
idx
,
s
in
enumerate
(
splits
):
unpacked_gv
.
append
((
tf
.
reshape
(
s
,
gpt
.
shapes
[
idx
]),
gpt
.
vars
[
idx
]))
return
unpacked_gv
def
pack_small_tensors
(
tower_grads
,
max_bytes
=
0
,
max_group
=
0
):
"""Concatenate small gradient tensors together for reduction.
Args:
tower_grads: List of lists of (gradient, variable) tuples.
max_bytes: Int giving max number of bytes in a tensor that
may be considered small.
max_group: Int giving max number of small tensors that may be
concatenated into one new tensor.
Returns:
new_tower_grads, packing where new_tower_grads is identical to
tower_grads except that all feasible small_tensors have been removed
from their places and concatenated into larger tensors that are
now in the front of the list for each tower, and packing contains
the data necessary to restore the tower_grads structure.
Look through the first tower for gradients of the same type (float),
and small size, that are all sequential. For each such group,
replace by a new tensor that is a flattened concatenation. Note
that the corresponding variable will be absent, which doesn't matter
because it isn't used during all-reduce.
Requires:
Every gv_list in towers must have isomorphic structure including identical
tensor sizes and types.
"""
small_indices
=
[]
large_indices
=
[]
for
idx
,
(
g
,
_
)
in
enumerate
(
tower_grads
[
0
]):
if
g
.
dtype
==
tf
.
float32
and
(
4
*
g
.
shape
.
num_elements
())
<=
max_bytes
:
small_indices
.
append
(
idx
)
else
:
large_indices
.
append
(
idx
)
small_ranges
,
small_singles
=
extract_ranges
(
small_indices
,
range_size_limit
=
max_group
)
large_indices
=
sorted
(
large_indices
+
small_singles
)
num_gv
=
len
(
tower_grads
[
0
])
packing
=
{}
if
small_ranges
:
new_tower_grads
=
[]
for
dev_idx
,
gv_list
in
enumerate
(
tower_grads
):
assert
len
(
gv_list
)
==
num_gv
new_gv_list
=
[]
for
r
in
small_ranges
:
key
=
'%d:%d'
%
(
dev_idx
,
len
(
new_gv_list
))
new_gv_list
.
append
((
pack_range
(
key
,
packing
,
gv_list
,
r
),
'packing_var_placeholder'
))
for
i
in
large_indices
:
new_gv_list
.
append
(
gv_list
[
i
])
new_tower_grads
.
append
(
new_gv_list
)
return
new_tower_grads
,
packing
else
:
return
tower_grads
,
None
def
unpack_small_tensors
(
tower_grads
,
packing
):
"""Undo the structure alterations to tower_grads done by pack_small_tensors.
Args:
tower_grads: List of List of (grad, var) tuples.
packing: A dict generated by pack_small_tensors describing the changes
it made to tower_grads.
Returns:
new_tower_grads: identical to tower_grads except that concatentations
of small tensors have been split apart and returned to their original
positions, paired with their original variables.
"""
if
not
packing
:
return
tower_grads
new_tower_grads
=
[]
num_devices
=
len
(
tower_grads
)
num_packed
=
len
(
packing
.
keys
())
//
num_devices
for
dev_idx
,
gv_list
in
enumerate
(
tower_grads
):
new_gv_list
=
gv_list
[
num_packed
:]
for
i
in
xrange
(
0
,
num_packed
):
k
=
'%d:%d'
%
(
dev_idx
,
i
)
gpt
=
packing
[
k
]
gv
=
unpack_grad_tuple
(
gv_list
[
i
],
gpt
)
for
gi
,
idx
in
enumerate
(
gpt
.
indices
):
assert
idx
==
gpt
.
indices
[
gi
]
new_gv_list
.
insert
(
idx
,
gv
[
gi
])
new_tower_grads
.
append
(
new_gv_list
)
return
new_tower_grads
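The small-tensor packing machinery in allreduce.py hinges on extract_ranges(), which splits a monotone index list into consecutive runs (eligible for packing) and leftover singles. Its documented behaviour can be illustrated with a standalone re-implementation, checked against the same inputs the unit tests use:

```python
# Standalone sketch of allreduce.extract_ranges: split a monotone index list
# into consecutive [first, last] ranges and leftover singles.
def extract_ranges(index_list, range_size_limit=32):
    if not index_list:
        return [], []
    first = last = index_list[0]
    ranges, singles = [], []
    for i in index_list[1:]:
        if i == last + 1 and (last - first) <= range_size_limit:
            last = i
        else:
            if last > first:
                ranges.append([first, last])
            else:
                singles.append(first)
            first = last = i
    if last > first:
        ranges.append([first, last])
    else:
        singles.append(first)
    return ranges, singles

# Matches the cases exercised in allreduce_test.py:
print(extract_ranges([1, 3, 4, 6, 7, 8, 9]))   # ([[3, 4], [6, 9]], [1])
print(extract_ranges([1, 3, 6, 9]))            # ([], [1, 3, 6, 9])
```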
TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/allreduce_test.py deleted 100644 → 0
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_cnn_benchmark.allreduce."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
as
pycoll
import
numpy
as
np
import
tensorflow.compat.v1
as
tf
from
tensorflow.python.framework
import
ops
from
tensorflow.python.framework
import
test_util
from
tensorflow.python.ops
import
variables
import
allreduce
class
AllReduceTest
(
tf
.
test
.
TestCase
):
def
testGroupKey
(
self
):
d0
=
[
'/job:worker/replica:0/task:0/device:GPU:1'
,
'/job:worker/replica:0/task:0/device:GPU:0'
,
'/job:worker/replica:0/task:0/device:GPU:3'
,]
d1
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:0'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,]
d2
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,
'/job:worker/replica:0/task:1/device:GPU:0'
,]
d3
=
[
'/job:worker/replica:0/task:1/device:GPU:1'
,
'/job:worker/replica:0/task:1/device:GPU:3'
,
'/job:worker/replica:0/task:1/device:GPU:2'
,]
d4
=
[
'/job:worker/task:0/device:GPU:1'
,
'/job:worker/task:0/device:GPU:2'
,
'/job:worker/task:0/device:GPU:3'
,]
d5
=
[
'/job:worker/task:0/device:CPU:1'
,
'/job:worker/task:0/device:CPU:2'
]
d6
=
[
'/job:worker/task:0/device:CPU:2'
,
'/job:worker/task:0/device:CPU:1'
]
g0
=
allreduce
.
collective_group_key
(
d0
)
g1
=
allreduce
.
collective_group_key
(
d1
)
g2
=
allreduce
.
collective_group_key
(
d2
)
g3
=
allreduce
.
collective_group_key
(
d3
)
g4
=
allreduce
.
collective_group_key
(
d4
)
g5
=
allreduce
.
collective_group_key
(
d5
)
g6
=
allreduce
.
collective_group_key
(
d6
)
self
.
assertEqual
(
g0
,
g1
)
self
.
assertEqual
(
g0
,
g2
)
self
.
assertNotEqual
(
g0
,
g3
)
self
.
assertEqual
(
g3
,
g4
)
self
.
assertEqual
(
g5
,
g6
)
self
.
assertNotEqual
(
g4
,
g5
)
def
testExtractRanges
(
self
):
x
=
[]
expected_ranges
=
[]
expected_singles
=
[]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
4
,
6
,
7
,
8
,
9
]
expected_ranges
=
[[
3
,
4
],
[
6
,
9
]]
expected_singles
=
[
1
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
2
,
3
,
4
,
6
,
7
,
8
,
9
]
expected_ranges
=
[[
1
,
4
],
[
6
,
9
]]
expected_singles
=
[]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
4
,
6
,
7
,
9
]
expected_ranges
=
[[
3
,
4
],
[
6
,
7
]]
expected_singles
=
[
1
,
9
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
x
=
[
1
,
3
,
6
,
9
]
expected_ranges
=
[]
expected_singles
=
[
1
,
3
,
6
,
9
]
ranges
,
singles
=
allreduce
.
extract_ranges
(
x
)
self
.
assertEqual
(
expected_ranges
,
ranges
)
self
.
assertEqual
(
expected_singles
,
singles
)
def
testPackRange
(
self
):
packing
=
{}
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
gv
=
[(
t0
,
'v0'
),
(
t1
,
'v1'
)]
new_t
=
allreduce
.
pack_range
(
'0:0'
,
packing
,
gv
,
[
0
,
1
])
self
.
assertEqual
(
1
,
new_t
.
shape
.
ndims
)
self
.
assertEqual
(
8
,
new_t
.
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v0'
,
'v1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])])
})
t2
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
gv
=
[(
t0
,
'v0'
),
(
t1
,
'v1'
),
(
t2
,
'v2'
),
(
t3
,
'v3'
)]
packing
=
{}
new_t
=
allreduce
.
pack_range
(
'1:0'
,
packing
,
gv
,
[
0
,
3
])
self
.
assertEqual
(
1
,
new_t
.
shape
.
ndims
)
self
.
assertEqual
(
26
,
new_t
.
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v0'
,
'v1'
,
'v2'
,
'v3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
]),
tf
.
TensorShape
([
3
,
3
])
])
})
def
testUnpackGradTuple
(
self
):
packing
=
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v0'
,
'v1'
,
'v2'
,
'v3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
]),
tf
.
TensorShape
([
3
,
3
])
])
}
tc
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
],
dtype
=
tf
.
float32
)
packed_gv
=
[
tc
,
'packing_var_placeholder'
]
gv
=
allreduce
.
unpack_grad_tuple
(
packed_gv
,
packing
[
'0:0'
])
self
.
assertLen
(
gv
,
4
)
self
.
assertEqual
(
'v0'
,
gv
[
0
][
1
])
self
.
assertEqual
(
'v1'
,
gv
[
1
][
1
])
self
.
assertEqual
(
'v2'
,
gv
[
2
][
1
])
self
.
assertEqual
(
'v3'
,
gv
[
3
][
1
])
self
.
assertEqual
(
1
,
gv
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
gv
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
gv
[
1
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
gv
[
1
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
2
,
gv
[
2
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
gv
[
2
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
gv
[
2
][
0
].
shape
.
dims
[
1
])
def
testPackSmallTensors
(
self
):
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
t2
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([[
0
,
1
,
2
],
[
3
,
4
,
5
],
[
6
,
7
,
8
]],
dtype
=
tf
.
float32
)
tower_grads
=
[]
for
d
in
range
(
0
,
3
):
gv
=
[(
t0
,
'v_%d_0'
%
d
),
(
t1
,
'v_%d_1'
%
d
),
(
t2
,
'v_%d_2'
%
d
),
(
t3
,
'v_%d_3'
%
d
)]
tower_grads
.
append
(
gv
)
# 1) Set the size limit so small that nothing gets concatenated.
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
12
,
max_group
=
10
)
self
.
assertEqual
(
tower_grads
,
new_tower_grads
)
self
.
assertIs
(
packing
,
None
)
# 2) Set the size limit so only the first two tensors get concatenated
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
16
,
# 16 bytes == 4 elements
max_group
=
10
)
self
.
assertLen
(
new_tower_grads
,
3
)
self
.
assertLen
(
tower_grads
[
0
],
4
)
first_tower
=
new_tower_grads
[
0
]
self
.
assertLen
(
first_tower
,
3
)
self
.
assertEqual
(
1
,
first_tower
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
8
,
first_tower
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_0_0'
,
'v_0_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_1_0'
,
'v_1_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'2:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_2_0'
,
'v_2_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])])})
# 3) Set the size limit so all tensors get concatenated
new_tower_grads
,
packing
=
allreduce
.
pack_small_tensors
(
tower_grads
,
max_bytes
=
256
,
# bytes = 64 elements
max_group
=
10
)
self
.
assertLen
(
new_tower_grads
,
3
)
self
.
assertLen
(
tower_grads
[
0
],
4
)
self
.
assertLen
(
new_tower_grads
[
0
],
1
)
first_tower
=
new_tower_grads
[
0
]
self
.
assertEqual
(
1
,
first_tower
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
26
,
first_tower
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
packing
,
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_0_0'
,
'v_0_1'
,
'v_0_2'
,
'v_0_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_1_0'
,
'v_1_1'
,
'v_1_2'
,
'v_1_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'2:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
4
),
vars
=
[
'v_2_0'
,
'v_2_1'
,
'v_2_2'
,
'v_2_3'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])])})
def
testUnpackSmallTensors
(
self
):
packing
=
{
'0:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_0_0'
,
'v_0_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'0:1'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
3
,
5
),
vars
=
[
'v_0_3'
,
'v_0_4'
],
shapes
=
[
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])]),
'1:0'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
2
),
vars
=
[
'v_1_0'
,
'v_1_1'
],
shapes
=
[
tf
.
TensorShape
([
4
]),
tf
.
TensorShape
([
4
])]),
'1:1'
:
allreduce
.
GradPackTuple
(
indices
=
range
(
3
,
5
),
vars
=
[
'v_1_3'
,
'v_1_4'
],
shapes
=
[
tf
.
TensorShape
([
3
,
3
,]),
tf
.
TensorShape
([
3
,
3
,])])}
t0
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
],
dtype
=
tf
.
float32
)
t1
=
tf
.
constant
([
17
,
17
],
dtype
=
tf
.
float32
)
t2
=
tf
.
constant
([
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
],
dtype
=
tf
.
float32
)
t3
=
tf
.
constant
([
0
],
dtype
=
tf
.
float32
)
tower_grads
=
[]
for
d
in
range
(
0
,
2
):
one_tower
=
[(
t0
,
'packing_var_placeholder'
),
(
t2
,
'packing_var_placeholder'
),
(
t1
,
'v_%d_2'
%
d
),
(
t3
,
'v_%d_5'
%
d
)]
tower_grads
.
append
(
one_tower
)
new_tower_grads
=
allreduce
.
unpack_small_tensors
(
tower_grads
,
packing
)
self
.
assertLen
(
new_tower_grads
,
2
)
for
d
,
tg
in
enumerate
(
new_tower_grads
):
self
.
assertLen
(
tg
,
6
)
self
.
assertEqual
(
'v_%d_0'
%
d
,
tg
[
0
][
1
])
self
.
assertEqual
(
'v_%d_1'
%
d
,
tg
[
1
][
1
])
self
.
assertEqual
(
'v_%d_2'
%
d
,
tg
[
2
][
1
])
self
.
assertEqual
(
'v_%d_3'
%
d
,
tg
[
3
][
1
])
self
.
assertEqual
(
'v_%d_4'
%
d
,
tg
[
4
][
1
])
self
.
assertEqual
(
'v_%d_5'
%
d
,
tg
[
5
][
1
])
self
.
assertEqual
(
1
,
tg
[
0
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
tg
[
0
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
tg
[
1
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
4
,
tg
[
1
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
1
,
tg
[
2
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
2
,
tg
[
2
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
2
,
tg
[
3
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
tg
[
3
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
tg
[
3
][
0
].
shape
.
dims
[
1
])
self
.
assertEqual
(
2
,
tg
[
4
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
3
,
tg
[
4
][
0
].
shape
.
dims
[
0
])
self
.
assertEqual
(
3
,
tg
[
4
][
0
].
shape
.
dims
[
1
])
self
.
assertEqual
(
1
,
tg
[
5
][
0
].
shape
.
ndims
)
self
.
assertEqual
(
1
,
tg
[
5
][
0
].
shape
.
dims
[
0
])
class DynamicPackingTest(test_util.TensorFlowTestCase):
  """Packing/Unpacking tests that require executing a TensorFlow session."""

  def _init_tensors(self, num_towers, tensor_shapes):
    """Construct a collection of tensors across multiple devices."""
    num_tensors = len(tensor_shapes)
    consts = []
    tensors = []
    vrbls = []
    tower_grads = []
    tf.Variable([-1], dtype=tf.int32, name='packing_var_placeholder')
    for dev_idx in range(0, num_towers):
      devname = '/job:localhost/device:GPU:%d' % dev_idx
      consts.append([])
      tensors.append([])
      vrbls.append([])
      with tf.device(devname):
        base_value = 0
        gv_tuples = []
        for t_idx in range(0, num_tensors):
          shape = tensor_shapes[t_idx]
          num_elts = 0
          for d in shape:
            num_elts = (num_elts or 1) * d
          c = np.fromiter(range(base_value, base_value + num_elts),
                          dtype=np.float32).reshape(shape)
          base_value += num_elts
          consts[dev_idx].append(c)
          tensors[dev_idx].append(tf.constant(c))
          vrbls[dev_idx].append(
              tf.Variable(c, name='v_d%d_t%d' % (dev_idx, t_idx)))
          gv_tuples.append((tensors[dev_idx][-1], vrbls[dev_idx][-1]))
        tower_grads.append(gv_tuples)
    return tower_grads, consts, tensors, vrbls

  _test_tuple = pycoll.namedtuple('_test_tuple',
                                  'num_devices, in_shapes out_shapes out_i')

  def _do_pack_unpack_test(self, tt):
    """Do a single pack-unpack test.

    Args:
      tt: A _test_tuple defining the parameters of the test to do.

    This test executes a graph that performs a pack of tower_grads
    followed by an unpack and verifies that the shapes and values
    of gradient tensors are unchanged, along with paired variables.
    """
    with ops.Graph().as_default():
      tower_grads, consts, _, vrbls = self._init_tensors(
          tt.num_devices, tt.in_shapes)
      packed_tg, packing = allreduce.pack_small_tensors(
          tower_grads, max_bytes=40, max_group=10)
      unpacked_tg = allreduce.unpack_small_tensors(packed_tg, packing)
      with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        packed = sess.run(packed_tg)
        for d in range(0, tt.num_devices):
          for t in range(0, len(tt.out_shapes)):
            num_elts = 0
            for dim in tt.out_shapes[t]:
              num_elts = (num_elts or 1) * dim
            self.assertTrue(
                np.array_equal(
                    np.array(range(tt.out_i[t], tt.out_i[t] + num_elts),
                             dtype=np.float32).reshape(tt.out_shapes[t]),
                    packed[d][t][0]))
        unpacked = sess.run(unpacked_tg)
        for d in range(0, tt.num_devices):
          for t in range(0, len(tt.in_shapes)):
            self.assertTrue(np.array_equal(consts[d][t], unpacked[d][t][0]))
            self.assertEqual(vrbls[d][t], unpacked_tg[d][t][1])

  def testPackUnpack0(self):
    self._do_pack_unpack_test(
        self._test_tuple(num_devices=3,
                         in_shapes=[[8], [3, 3], [12], [5, 5, 5]],
                         out_shapes=[[17], [12], [5, 5, 5]],
                         out_i=[0, 17, 29]))

  def testPackUnpack1(self):
    self._do_pack_unpack_test(
        self._test_tuple(num_devices=4,
                         in_shapes=[[5, 5, 5], [2, 3], [5]],
                         out_shapes=[[11], [5, 5, 5]],
                         out_i=[125, 0]))

  def testPackUnpack2(self):
    self._do_pack_unpack_test(
        self._test_tuple(num_devices=2,
                         in_shapes=[[5, 5, 5], [2, 3], [1, 5], [7], [100]],
                         out_shapes=[[18], [5, 5, 5], [100]],
                         out_i=[125, 0, 143]))

  def _do_all_reduce_pack_test(self, tt):
    """Test that all-reduce results are the same with or without packing."""
    with ops.Graph().as_default():
      tower_grads, consts, _, _ = self._init_tensors(
          tt.num_devices, tt.in_shapes)
      dev_prefixes = ['/job:localhost']
      num_workers = 1
      alg = 'xring'
      shards = 1
      single_session = True
      gpu_indices = range(0, tt.num_devices)
      assert len(gpu_indices) == len(tower_grads)
      no_pack_all_reduce = allreduce.sum_gradients_all_reduce(
          single_session,
          dev_prefixes, tower_grads, num_workers, alg, shards,
          gpu_indices,
          agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
      packed_tg, packing = allreduce.pack_small_tensors(tower_grads, 100, 100)
      packed_all_reduce = allreduce.sum_gradients_all_reduce(
          single_session,
          dev_prefixes, packed_tg, num_workers, alg, shards,
          gpu_indices,
          agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
      unpacked_tg = allreduce.unpack_small_tensors(packed_all_reduce, packing)
      with self.test_session() as sess:
        sess.run(variables.global_variables_initializer())
        no_pack_values = sess.run(no_pack_all_reduce)
        pack_unpack_values = sess.run(unpacked_tg)
        for d in range(1, tt.num_devices):
          for t in range(0, len(tt.in_shapes)):
            self.assertTrue(
                np.allclose(no_pack_values[d][t][0],
                            tt.num_devices * consts[0][t]))
            self.assertTrue(
                np.array_equal(no_pack_values[d][t][0],
                               pack_unpack_values[d][t][0]))

  def testAllReducePacked0(self):
    self._do_all_reduce_pack_test(
        self._test_tuple(num_devices=3,
                         in_shapes=[[8], [3, 3], [12], [5, 5, 5]],
                         out_shapes=[[17], [12], [5, 5, 5]],
                         out_i=[0, 17, 29]))

  def testAllReducePacked1(self):
    self._do_all_reduce_pack_test(
        self._test_tuple(num_devices=2,
                         in_shapes=[[8], [3, 3], [12], [5, 5, 5], [3], [4]],
                         out_shapes=[[17], [7], [12], [5, 5, 5]],
                         out_i=[0, 17, 29, 154, 157]))


if __name__ == '__main__':
  tf.disable_v2_behavior()
  tf.test.main()
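For reference, the pack/unpack round trip that DynamicPackingTest exercises looks roughly like the following when applied to real tower gradients. This is a minimal sketch (not part of the original file), mirroring the calls made in the tests above; it assumes tower_grads is a list with one (gradient, variable) list per device and that the allreduce module from this directory is importable.

# Sketch only: pack small gradients, all-reduce the packed form, then unpack
# back to the original per-tensor shapes. tower_grads is assumed to be
# [[(grad, var), ...], ...], one inner list per GPU, as in the tests above.
packed_tg, packing = allreduce.pack_small_tensors(
    tower_grads, max_bytes=40, max_group=10)
summed = allreduce.sum_gradients_all_reduce(
    True,                       # single_session
    ['/job:localhost'],         # dev_prefixes
    packed_tg, 1, 'xring', 1,   # num_workers, alg, shards
    range(len(packed_tg)),      # gpu_indices, one per device
    agg_small_grads_max_bytes=0, agg_small_grads_max_group=1)
unpacked_tg = allreduce.unpack_small_tensors(summed, packing)

The point of the round trip is that many tiny all-reduces are replaced by a few larger ones, while unpack_small_tensors restores the original shapes and variable pairings.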
TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/batch_allreduce.py
deleted 100644 → 0
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains classes and functions for doing a single-machine batch all-reduce.
An all-reduce is taking the reduction (typically a sum) of a list of tensors,
each on a different device. The result must end up back on each device, which is
where the word "all" comes from. In summary, each device starts with a single
tensor, and ends up with the reduction of all tensors.
A batch all-reduce is doing several independent all-reduces. When doing a batch
all-reduce, care is taken to evenly distribute the reduction computations
across devices and inter-device tensor transfers across device links.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# TODO(reedwm): Support distributed all-reduces in this file.
# TODO(reedwm): Merge this code with allreduce.py, which contains some batch
# all-reduce code that this file calls. allreduce.py also supports distributed
# batch-reduce while this file only supports single-machine all-reduce.

import abc

import six
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import data_flow_ops

import allreduce
import constants
def _all_reduce_using_copy(tensors_across_devices, use_mean):
  """Does an all-reduce of a list of tensors by copying to the current device.

  The tensors are copied to the current device and then reduced.

  Args:
    tensors_across_devices: A list of tensors, each on a different device.
    use_mean: Whether to take the mean of the tensors instead of a sum.
  Returns:
    A reduced tensor on the current device.
  """
  reduced_tensor = tf.add_n(tensors_across_devices)
  if use_mean:
    reduced_tensor *= 1 / len(tensors_across_devices)
  return reduced_tensor
@six.add_metaclass(abc.ABCMeta)
class BatchAllReduceAlgorithm(object):
  """Represents an algorithm for performing a batch all-reduce operation."""

  def batch_all_reduce(self,
                       all_device_tensors,
                       num_splits,
                       compact_tensors,
                       defer_tensors,
                       xla_compile=False):
    """Performs a batch all-reduce.

    The reduction done is a sum.

    `all_device_tensors` is a list of list of tensors that will be batch
    all-reduced. All tensors within a single inner list must be on the same
    device. The nth element in each list, for any n, will be reduced together.
    The return value is in the same form as `all_device_tensors`, except that
    each tensor is reduced.

    For example, if `all_device_tensors` is:
    [[ A, B ],  # A and B are on GPU 0
     [ C, D ]]  # C and D are on GPU 1

    Then the return value will be:
    [[ A+C, B+D ],  # These two tensors are on GPU 0
     [ A+C, B+D ]]  # These two tensors are on GPU 1

    Args:
      all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
        is a tensor where `i` is the device index and `j` is the tensor index.
      num_splits: If not None, tensors will be concatenated and split into this
        many pieces during the all-reduce, then split back into their original
        shapes afterwards. Has no impact on correctness and can improve
        performance. Requires all tensors to be the same type.
      compact_tensors: If True, tensors are casted to fp16 before being all-
        reduced. Improves performance, but hurts numerical stability.
      defer_tensors: If True, every time the return value
        `reduced_all_device_tensors` is evaluated, the result will be the
        reduced tensors values of `all_device_tensors` from the previous
        session run instead of the current session run, or zero on the first
        session run. This can improve performance. When training neural
        networks, deferring gradients often does not harm training, so this
        can be used to improve performance.
      xla_compile: If True, use XLA to compile gradients packing and unpacking
        ops.

    Returns:
      reduced_all_device_tensors: A list in the same form as
        `all_device_tensors`, except each tensor has been reduced.
      warmup_ops: A list of ops needed to be run once before the all-reduce
        can occur.
    """
    # Before all-reducing tensors, we do several preprocessing functions that
    # can speed up the all-reduce. We undo these functions after all-reducing
    # the tensors.

    # all_device_packed_tensors is a 2-d list of tensors indexed by
    # [device_id][tensor_id], holding packed tensors from all devices involved
    # in all-reduce.
    all_device_packed_tensors = []
    # all_device_warmup_ops is a 2-d list of ops indexed by
    # [device_id][tensor_id], holding warmup_ops that need to be run once
    # before all-reduce can occur.
    all_device_warmup_ops = []
    # all_device_put_ops is a 2-d list of ops indexed by
    # [device_id][tensor_id], holding put ops for deferred tensors. They will
    # be called in each all-reduce step automatically due to control
    # dependency.
    all_device_put_ops = []
    # packers is a list of _TensorPacker, one for each device involved in
    # all-reduce.
    packers = [
        _TensorPacker(num_splits, compact_tensors) for _ in all_device_tensors
    ]

    for packer, device_tensors in zip(packers, all_device_tensors):

      def pack_single_device_tensors(packer=packer,
                                     device_tensors=device_tensors):
        """Pack gradient tensors of a device."""
        packed_tensors = packer.maybe_concat_tensors(device_tensors)
        packed_tensors = packer.maybe_compact_tensors(packed_tensors)
        # When xla_compile=False, defer tensors after concat for better
        # performance.
        if defer_tensors and not xla_compile:
          packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
              packed_tensors)
          all_device_put_ops.append(put_ops)
          all_device_warmup_ops.append(warmup_ops)
        packed_tensors = packer.maybe_split_tensors(packed_tensors)
        return packed_tensors

      with tf.device(device_tensors[0].device):
        if xla_compile:
          packed_tensors = tf.xla.experimental.compile(
              pack_single_device_tensors)
          # When xla_compile=True, intermediate tensors in packing process are
          # not materialized. Thus, we defer tensors after packing process is
          # completed instead of in the middle of it.
          if defer_tensors:
            packed_tensors, put_ops, warmup_ops = defer_single_device_tensors(
                packed_tensors)
            all_device_put_ops.append(put_ops)
            all_device_warmup_ops.append(warmup_ops)
        else:
          packed_tensors = pack_single_device_tensors()
        all_device_packed_tensors.append(packed_tensors)

    # Perform all-reduce on packed tensors.
    all_device_tensors = self._do_batch_all_reduce(all_device_packed_tensors)

    all_device_unpacked_tensors = []
    for packer, device_tensors in zip(packers, all_device_tensors):

      def unpack_single_device_tensors(packer=packer,
                                       device_tensors=device_tensors):
        """Unpack gradient tensors of a device."""
        unpacked_tensors = packer.undo_maybe_split_tensors(device_tensors)
        unpacked_tensors = packer.undo_maybe_compact_tensors(unpacked_tensors)
        unpacked_tensors = packer.undo_maybe_concat_tensors(unpacked_tensors)
        return unpacked_tensors

      with tf.device(device_tensors[0].device):
        if xla_compile:
          unpacked_device_tensor = tf.xla.experimental.compile(
              unpack_single_device_tensors)
        else:
          unpacked_device_tensor = unpack_single_device_tensors()
        all_device_unpacked_tensors.append(unpacked_device_tensor)

    # Note: There is no undo operation for deferring tensors. But we do need
    # to call _add_put_op_control_deps at the end if we deferred the tensors.
    if defer_tensors:
      all_device_unpacked_tensors = _add_put_op_control_deps(
          all_device_unpacked_tensors, num_splits, all_device_put_ops)

    return all_device_unpacked_tensors, all_device_warmup_ops

  @abc.abstractmethod
  def _do_batch_all_reduce(self, all_device_tensors):
    """Performs a batch all-reduce.

    Unlike `self.batch_all_reduce`, this does not do any preprocessing of the
    tensors.

    Args:
      all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
        is a tensor where `i` is the device index and `j` is the tensor index.
    Returns:
      reduced_all_device_tensors: A list in the same form as
        `all_device_tensors`, except each tensor has been reduced.
    """
    pass
class CopyToDeviceAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that copies tensors to be reduced to a specific device."""

  def __init__(self, devices_to_reduce_on, use_mean=False):
    self._devices = devices_to_reduce_on
    self._use_mean = use_mean

  def _do_batch_all_reduce(self, all_device_tensors):
    reduced_tensors = []
    for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
      with tf.device(self._devices[i % len(self._devices)]):
        reduced_tensor = _all_reduce_using_copy(tensors_across_devices,
                                                self._use_mean)
        reduced_tensors.append(reduced_tensor)
    # The tensors will be brought back to each device once they are used.
    return [reduced_tensors] * len(all_device_tensors)
class HierarchicalCopyAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that uses hierarchical copies. This is only optimized for
  eight devices connected in NetworkTopology.DGX1 or NetworkTopology.GCP_V100
  topology.
  """

  def __init__(self, network_topology):
    """Initializer for HierarchicalCopyAlgorithm.

    Args:
      network_topology: An instance of Enum class constants.NetworkTopology.
    """
    self._network_topology = network_topology

  def _do_batch_all_reduce(self, all_device_tensors):
    avail_devices = [device_tensors[0].device
                     for device_tensors in all_device_tensors]
    reduced_tensors = []
    num_devices = len(avail_devices)
    group_size = num_devices // 2
    for i, tensors_across_devices in enumerate(zip(*all_device_tensors)):
      group_0_main_device, group_1_main_device = self.__get_main_devices(
          i, num_devices)
      if group_0_main_device < group_size:
        group_0_begin = 0
        group_1_begin = group_size
      else:
        group_0_begin = group_size
        group_1_begin = 0

      # Reduce the first group.
      group_0_tensors = tensors_across_devices[group_0_begin:
                                               group_0_begin + group_size]
      with tf.device(avail_devices[group_0_main_device]):
        group_0_reduced_tensor = _all_reduce_using_copy(group_0_tensors, False)

      # Reduce the second group.
      group_1_tensors = tensors_across_devices[group_1_begin:
                                               group_1_begin + group_size]
      with tf.device(avail_devices[group_1_main_device]):
        group_1_reduced_tensor = _all_reduce_using_copy(group_1_tensors, False)

      # Reduce between the groups.
      with tf.device(avail_devices[group_0_main_device]):
        total_reduced_tensor = _all_reduce_using_copy(
            [group_0_reduced_tensor, group_1_reduced_tensor], False)

      # Broadcast the result back into the root of each group.
      with tf.device(avail_devices[group_0_main_device]):
        group_0_reduced_tensor_bcast = tf.identity(total_reduced_tensor)
      with tf.device(avail_devices[group_1_main_device]):
        group_1_reduced_tensor_bcast = tf.identity(total_reduced_tensor)

      reduced_tensors_bcast = []
      for j in range(len(tensors_across_devices)):
        with tf.device(avail_devices[j]):
          # Broadcast the result back to each member in the group from the
          # root.
          if (group_0_main_device < group_size) == (j < group_size):
            src_device_tensor = group_0_reduced_tensor_bcast
          else:
            src_device_tensor = group_1_reduced_tensor_bcast
          reduced_tensors_bcast.append(tf.identity(src_device_tensor))

      reduced_tensors.append(reduced_tensors_bcast)

    reduced_tensors = list(zip(*reduced_tensors))
    return reduced_tensors

  def __get_main_devices(self, tensor_index, num_devices):
    """Returns the pair of main devices to use for initial reduction.

    Args:
      tensor_index: Index of the current tensor in the list of tensors to
        copy.
      num_devices: Total number of devices.
    Returns:
      A tuple containing pair of main device indices for the initial
      reduction. Then, the first element of the tuple should be used for the
      final reduction.
    Raises:
      ValueError: Invalid input arguments.
    """
    if self._network_topology == constants.NetworkTopology.DGX1:
      return tensor_index % num_devices, (tensor_index +
                                          (num_devices // 2)) % num_devices
    elif self._network_topology == constants.NetworkTopology.GCP_V100:
      if num_devices != 8:
        raise ValueError('HierarchicalCopy only supports eight devices in %s.'
                         % self._network_topology)
      # TODO(hinsu): Generalize main device indices to handle any other
      # isomorphic connection graph that connects two cliques using
      # connections other than 0-5 and 2-7.
      main_device_pairs = [(0, 5), (2, 7), (5, 0), (7, 2)]
      return main_device_pairs[tensor_index % len(main_device_pairs)]
    else:
      # TODO(reedwm): make this logic more general for arbitrary topology.
      raise ValueError(
          'HierarchicalCopy is not supported for %s network topology.' %
          self._network_topology)
class AllReduceSpecAlgorithm(BatchAllReduceAlgorithm):
  """An algorithm that uses an all reduce spec."""

  def __init__(self, all_reduce_spec, gpu_indices, agg_small_grads_max_bytes,
               agg_small_grads_max_group):
    spec = allreduce.parse_all_reduce_spec(all_reduce_spec)
    if len(spec) != 1:
      raise ValueError(
          'Replicated mode does not support hybrid all-reduce strategies')
    self._all_reduce_spec = spec[0]
    self._gpu_indices = gpu_indices
    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
    self._agg_small_grads_max_group = agg_small_grads_max_group

  def _do_batch_all_reduce(self, all_device_tensors):
    # TODO(reedwm): Merge allreduce.sum_gradients_all_reduce with the other
    # gradient aggregation code, since gradient aggregation is doing an all
    # reduce. Currently, we do gradient repacking in two different places.
    # TODO(reedwm): Change the allreduce code to reduce tensors instead of
    # tower_grads.
    tower_grads = [[(t, None) for t in device_tensors]
                   for device_tensors in all_device_tensors]
    aggregated_device_grads = allreduce.sum_gradients_all_reduce(
        False,  # single_session
        ['/job:localhost'],
        tower_grads,
        1,
        self._all_reduce_spec.alg,
        self._all_reduce_spec.shards,
        self._gpu_indices,
        agg_small_grads_max_bytes=self._agg_small_grads_max_bytes,
        agg_small_grads_max_group=self._agg_small_grads_max_group)
    return [[t for t, _ in grad_vars] for grad_vars in aggregated_device_grads]
def algorithm_from_params(params):
  """Returns a BatchAllReduceAlgorithm from a Params tuple."""
  if params.all_reduce_spec:
    if params.gpu_indices:
      gpu_indices = [int(x) for x in params.gpu_indices.split(',')]
    else:
      gpu_indices = [x for x in range(params.num_gpus)]
    return AllReduceSpecAlgorithm(params.all_reduce_spec, gpu_indices,
                                  params.agg_small_grads_max_bytes,
                                  params.agg_small_grads_max_group)
  elif params.hierarchical_copy:
    return HierarchicalCopyAlgorithm(params.network_topology)
  else:
    if params.local_parameter_device == 'gpu':
      devices_to_reduce_on = ['/gpu:%d' % i for i in range(params.num_gpus)]
    else:
      devices_to_reduce_on = ['/cpu:0']
    return CopyToDeviceAlgorithm(devices_to_reduce_on)
def _apply_to_all_device_tensors(all_device_tensors, apply_func, colocate=True):
  """Applies a function to each tensor in `all_device_tensors`.

  A new list of lists of tensors is returned, where every tensor in
  `all_device_tensors` has had `apply_func` called on it. `all_device_tensors`
  is not modified.

  Args:
    all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
      is a tensor where `i` is the device index and `j` is the tensor index.
    apply_func: A function taking in three arguments: tensor, device_index,
      tensor_index, and returning a modified tensor.
      `tensor` is `all_device_tensors[device_index][tensor_index]`.
    colocate: If True, apply_func will be run under a context manager
      colocated with its input tensor.
  Returns:
    A list in the same form as `all_device_tensors`, except each tensor has
    had `apply_func` called on it.
  """
  new_all_device_tensors = []
  for device_index, device_tensors in enumerate(all_device_tensors):
    new_device_tensors = []
    for tensor_index, t in enumerate(device_tensors):
      if colocate:
        with tf.colocate_with(t):
          new_t = apply_func(t, device_index, tensor_index)
      else:
        new_t = apply_func(t, device_index, tensor_index)
      new_device_tensors.append(new_t)
    new_all_device_tensors.append(new_device_tensors)
  return new_all_device_tensors
def _defer_tensor(tensor):
  """Defers the retrieval of a tensor.

  The tensor is put into a StagingArea, and the return value is the
  retrieval of the tensor from the StagingArea. The effect is that the
  tensor returned from this function is the tensor that was put in the
  StagingArea for the previous Session.run() call.

  Args:
    tensor: The tensor to defer for one step.

  Returns:
    deferred_tensor: The tensor deferred for one step.
    put_op: An op to put `tensor` in the StagingArea. Must be run every step
      that `deferred_tensor` is run.
    warmup_op: A warmup op that should be called before the first step. Puts
      a zero tensor into the StagingArea.
  """
  tensor_stage = data_flow_ops.StagingArea([tensor.dtype], [tensor.shape])
  put_op = tensor_stage.put([tensor])
  warmup_op = tensor_stage.put([tf.zeros(tensor.shape, dtype=tensor.dtype)])

  # Fetch the next tensor to use.
  (tensor,) = tensor_stage.get()
  return tensor, put_op, warmup_op


def defer_single_device_tensors(device_tensors):
  """Defer tensors (gradients in this case) from a single device.

  Args:
    device_tensors: A list of gradients tensors from a single device to defer.

  Returns:
    deferred_tensors: A list of tensors deferred for one step.
    put_ops: A list of ops that put `tensors` in the StagingAreas. Must be run
      every step that `deferred_tensors` is run.
    warmup_ops: Warmup ops that should be called before the first step. Puts
      zero tensors into the StagingArea.
  """
  put_ops = []
  warmup_ops = []
  deferred_tensors = []

  for tensor in device_tensors:
    deferred_tensor, put_op, warmup_op = _defer_tensor(tensor)
    deferred_tensors.append(deferred_tensor)
    put_ops.append(put_op)
    warmup_ops.append(warmup_op)

  return deferred_tensors, put_ops, warmup_ops


def _add_put_op_control_deps(all_device_tensors, num_splits, put_ops):
  """Add control dependencies from `put_ops` to `all_device_tensors`.

  This should only be called when deferred tensors are being used.

  The control dependencies are added so that the put ops are run whenever
  `all_device_tensors` is run. That way, the caller does not have to
  explicitly run the put ops.

  Args:
    all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]`
      is a tensor where `i` is the device index and `j` is the tensor index.
    num_splits: The number of splits that were used for the all-reduce.
    put_ops: A list of put ops from deferring the tensors.
  Returns:
    A list in the same form as `all_device_tensors`, except each tensor has a
    control dependency on an op in `put_ops`.
  """
  def apply_func(tensor, device_index, tensor_index):
    if num_splits == 0:
      deps = [put_ops[device_index][tensor_index]]
    else:
      deps = put_ops[device_index]
      assert len(deps) == 1
    with tf.control_dependencies(deps):
      return tf.identity(tensor, name='control_dependency')
  return _apply_to_all_device_tensors(all_device_tensors, apply_func)
class _TensorPacker(object):
  """Packs and unpacks tensors into groups.

  This class first concatenates a set of tensors, then splits the
  concatenated tensor into a small number of chunks. This is useful for
  all-reducing tensors, as doing a small number of all-reduces on large
  tensors can be faster than doing a large number of all-reduces on small
  tensors.

  It also provides an option to compact tensors by casting them to fp16, for
  better all-reduce performance.

  This class maintains state about the processed tensors, such as shapes and
  types, so each packer can only be used to pack and unpack one list of
  tensors. If you need to pack multiple lists of tensors (say from multiple
  devices), then you need multiple _TensorPacker objects, one for each device.
  """

  def __init__(self, num_splits, compact):
    """Initializes the _TensorPacker.

    Args:
      num_splits: The number of tensors to split the concatenated tensor into.
        The batch all-reduce will consist of `num_splits` all-reduces. If None
        or zero, tensors are not split or concatenated.
      compact: If True, tensors are cast to fp16 during packing and cast back
        to their original dtypes during unpacking.
    """
    self._num_splits = num_splits
    self._compact = compact
    self._before_compact_dtypes = []

  def maybe_concat_tensors(self, device_tensors):
    """Concatenate tensors into a single tensor."""
    if not self._num_splits:
      return device_tensors

    flat_tensors = [tf.reshape(t, [-1]) for t in device_tensors]
    self._orig_shapes = [t.shape for t in device_tensors]
    self._orig_sizes = [s.num_elements() for s in self._orig_shapes]
    # All shapes must be fully defined.
    assert None not in self._orig_sizes
    concatenated_grad = tf.concat(flat_tensors, 0)
    return [concatenated_grad]

  def maybe_split_tensors(self, concatenated_tensor):
    """Split concatenated tensor into `num_splits` pieces."""
    if not self._num_splits:
      return concatenated_tensor

    if len(concatenated_tensor) != 1:
      raise RuntimeError('tensors must be concatenated via '
                         'maybe_concat_tensors() before splitting')

    concatenated_tensor = concatenated_tensor[0]
    total_tensor_size = concatenated_tensor.shape.num_elements()
    split_size = total_tensor_size // self._num_splits
    split_size_last = total_tensor_size - split_size * (self._num_splits - 1)
    split_sizes = [split_size] * (self._num_splits - 1) + [split_size_last]
    tensor_packs = tf.split(concatenated_tensor, split_sizes)
    return tensor_packs

  def undo_maybe_split_tensors(self, tensor_packs):
    """Undo maybe_split_tensors()."""
    if not self._num_splits:
      return tensor_packs

    return [tf.concat(tensor_packs, 0)]

  def undo_maybe_concat_tensors(self, concatenated_tensor):
    """Undo maybe_concat_tensors()."""
    if not self._num_splits:
      return concatenated_tensor

    if len(concatenated_tensor) != 1:
      raise RuntimeError(
          'undo_maybe_split_tensors() must be called before '
          'undo_maybe_concat_tensors when num_splits is greater than 1')
    concatenated_tensor = concatenated_tensor[0]

    tensors_with_sizes = tf.split(concatenated_tensor, self._orig_sizes)
    tensors_with_shapes = [
        tf.reshape(grad, shape)
        for grad, shape in zip(tensors_with_sizes, self._orig_shapes)
    ]
    return tensors_with_shapes

  def maybe_compact_tensors(self, device_tensors):
    """Cast tensors to fp16 and store their original types."""
    if not self._compact:
      return device_tensors

    if self._before_compact_dtypes:
      raise RuntimeError('maybe_compact_tensors can only be called once.')

    self._before_compact_dtypes = [t.dtype for t in device_tensors]
    compact_tensors = [tf.cast(t, tf.float16) for t in device_tensors]

    return compact_tensors

  def undo_maybe_compact_tensors(self, compact_tensors):
    """Undo maybe_compact_tensors()."""
    if not self._compact:
      return compact_tensors

    if not self._before_compact_dtypes:
      raise RuntimeError('maybe_compact_tensors() must be called before '
                         'undo_maybe_compact_tensors()')

    device_tensors = [
        tf.cast(t, dtype)
        for t, dtype in zip(compact_tensors, self._before_compact_dtypes)
    ]
    return device_tensors
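As a usage note for the classes above, a minimal sketch (not part of the original file) of how benchmark code would typically drive a BatchAllReduceAlgorithm is shown below. It assumes `params` is the tf_cnn_benchmarks Params tuple and `all_device_tensors[i][j]` is tensor j on device i, as described in batch_all_reduce; the exact mapping of flags to arguments here is illustrative, not a quote of the calling code.

# Sketch only: pick an algorithm from the flags, then batch all-reduce the
# per-device gradient tensors.
algorithm = algorithm_from_params(params)
reduced_tensors, warmup_ops = algorithm.batch_all_reduce(
    all_device_tensors,
    num_splits=params.gradient_repacking,
    compact_tensors=params.compact_gradient_transfer,
    defer_tensors=(params.variable_consistency == 'relaxed'))
# When tensors are deferred, warmup_ops must be run once (e.g. with
# sess.run(warmup_ops)) before the first training step.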
TensorFlow2x/ComputeVision/Classification/benchmarks-master/scripts/tf_cnn_benchmarks/benchmark_cnn.py
deleted 100644 → 0
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow benchmark library.
See the README for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
from collections import namedtuple
import contextlib
import math
import multiprocessing
import os
import re
import threading
import time
import traceback

from absl import flags as absl_flags
import numpy as np

import six
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf

# pylint: disable=g-direct-tensorflow-import
import cnn_util
import constants
import datasets
import flags
import mlperf
import variable_mgr
import variable_mgr_util
from cnn_util import log_fn
from models import model_config
from platforms import util as platforms_util
from google.protobuf import text_format
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python import debug as tf_debug
from tensorflow.python.client import timeline
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_util_impl
from tensorflow.python.framework import importer
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import nest


_DEFAULT_NUM_BATCHES = 100
# GraphInfo encapsulates the tensors/ops that we care about after building a
# graph. We use them to benchmark the graph.
GraphInfo = namedtuple(  # pylint: disable=invalid-name
    'GraphInfo',
    [
        # Ops that produce the input batches (before preprocessing).
        'input_producer_op',
        # Ops that adds the preprocessed images to the staging areas
        'enqueue_ops',
        # Fetches of sess.run()
        'fetches',
        # Op that performs synchronization in distributed mode
        'execution_barrier',
        # The global step variable
        'global_step',
        # Group of ops that perform per-device initialization work
        'local_var_init_op_group',
        # Op to produce summaries
        'summary_op'
    ])

# InputProcessingInfo contains various sources of inputs which will be later
# fed into the model. If synthetic data is used, all three fields are None.
InputProcessingInfo = namedtuple(
    'InputProcessingInfo',
    [
        # The first two fields are non-None iff datasets prefetching is not
        # used.

        # Ops that produce the input batches.
        'input_producer_op',
        # A list of StagingArea for each device.
        'input_producer_stages',

        # Input produced using multi device iterator. Non-None iff datasets
        # prefetching is used
        'multi_device_iterator_input'
    ])
# TODO(reedwm): add upper_bound and lower_bound to appropriate integer and
# float flags, and change certain string flags to enum flags.

flags.DEFINE_string('model', 'trivial',
                    'Name of the model to run, the list of supported models '
                    'are defined in models/model.py')
# The code will first check if it's running under benchmarking mode
# or evaluation mode, depending on 'eval':
# Under the evaluation mode, this script will read a saved model,
# and compute the accuracy of the model against a validation dataset.
# Additional ops for accuracy and top_k predictors are only used under
# this mode.
# Under the benchmarking mode, user can specify whether or not to use
# the forward-only option, which will only compute the loss function.
# forward-only cannot be enabled with eval at the same time.
flags.DEFINE_boolean('eval', False, 'whether use eval or benchmarking')
flags.DEFINE_integer('eval_interval_secs', 0,
                     'How often to run eval on saved checkpoints. Usually the '
                     'same as save_model_secs from the corresponding training '
                     'run. Pass 0 to eval only once.')
flags.DEFINE_integer('eval_during_training_every_n_steps', None,
                     'Every n steps during training, pause training, run '
                     'evaluation, then resume training. Must not be used with '
                     '--eval, as unlike --eval, this option causes both '
                     'training and eval to be done. This may take slightly '
                     'more GPU memory than running just training or evaluation '
                     'alone. It also may slightly slow down training, even '
                     'when not taking into account the additional time to '
                     'evaluate.',
                     lower_bound=1)
flags.DEFINE_float('eval_during_training_every_n_epochs', None,
                   'After every n training epochs, pause training, run '
                   'evaluation, then resume training. See '
                   '--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_steps', [],
                  'Specify a list of training steps, pause training at each of '
                  'these steps, run evaluation, then resume training. See '
                  '--eval_during_training_every_n_steps for more information.')
flags.DEFINE_list('eval_during_training_at_specified_epochs', [],
                  'Specify a list of training epochs, pause training after '
                  'each of these epochs, run evaluation, then resume training. '
                  'See --eval_during_training_every_n_steps for more '
                  'information.')
flags.DEFINE_boolean('forward_only', False,
                     'whether use forward-only or training for benchmarking')
flags.DEFINE_boolean('freeze_when_forward_only', False,
                     'whether to freeze the graph when in forward-only mode.')
flags.DEFINE_boolean('print_training_accuracy', False,
                     'whether to calculate and print training accuracy during '
                     'training')
flags.DEFINE_integer('batch_size', 0, 'batch size per compute device')
flags.DEFINE_integer('eval_batch_size', 0,
                     'eval batch size per compute device')
flags.DEFINE_integer('batch_group_size', 1,
                     'number of groups of batches processed in the image '
                     'producer.')
flags.DEFINE_integer('num_batches', None,
                     'number of batches to run, excluding '
                     'warmup. Defaults to %d' % _DEFAULT_NUM_BATCHES)
flags.DEFINE_integer('num_eval_batches', None,
                     'number of eval batches to run, excluding warmup. '
                     'Defaults to --num_batches')
flags.DEFINE_float('num_epochs', None,
                   'number of epochs to run, excluding warmup. '
                   'This and --num_batches cannot both be specified.')
flags.DEFINE_float('num_eval_epochs', None,
                   'number of eval epochs to run, excluding warmup. '
                   'Defaults to --num_epochs')
flags.DEFINE_float('stop_at_top_1_accuracy', None,
                   'If set, stops training after the evaluation accuracy hits '
                   'this number. Can only be used with one of the '
                   '--eval_during_training_* flags.')
flags.DEFINE_boolean('collect_eval_results_async', False,
                     'If True, start a separate process to postprocess eval '
                     'results asynchronously. This currently only works with '
                     'the SSD model.')
flags.DEFINE_integer('num_warmup_batches', None,
                     'number of batches to run before timing')
flags.DEFINE_integer('autotune_threshold', None,
                     'The autotune threshold for the models')
# TODO(tucker): change num_gpus to num_devices
flags.DEFINE_integer('num_gpus', 1, 'the number of GPUs to run on')
flags.DEFINE_string('gpu_indices', '', 'indices of worker GPUs in ring order')
flags.DEFINE_integer('display_every', 10,
                     'Number of local steps after which progress is printed '
                     'out')
flags.DEFINE_float('display_perf_ewma', None,
                   'If set, display numbers of images/sec using exponentially '
                   'weighted moving average with the specified weight, which '
                   'defines how much current value contributes to the reported '
                   'average. Increasing weight makes the reported performance '
                   'number reflect more about the real-time speed instead of '
                   'the entire history',
                   lower_bound=0, upper_bound=1)
flags.DEFINE_string('data_dir', None,
                    'Path to dataset in TFRecord format (aka Example '
                    'protobufs). If not specified, synthetic data will be '
                    'used.')
flags.DEFINE_string('data_name', None,
                    'Name of dataset: imagenet or cifar10. If not specified, '
                    'it is automatically guessed based on data_dir.')
flags.DEFINE_string('resize_method', 'bilinear',
                    'Method for resizing input images: crop, nearest, '
                    'bilinear, bicubic, area, or round_robin. The `crop` mode '
                    'requires source images to be at least as large as the '
                    'network input size. The `round_robin` mode applies '
                    'different resize methods based on position in a batch in '
                    'a round-robin fashion. Other modes support any sizes and '
                    'apply random bbox distortions before resizing (even with '
                    'distortions=False).')
flags.DEFINE_boolean('distortions', False,
                     'Enable/disable distortions during image preprocessing. '
                     'These include bbox and color distortions.')
flags.DEFINE_boolean('use_datasets', True,
                     'Enable use of datasets for input pipeline')
flags.DEFINE_string('input_preprocessor', 'default',
                    'Name of input preprocessor. The list of supported input '
                    'preprocessors are defined in preprocessing.py.')
flags.DEFINE_string('gpu_thread_mode', 'gpu_private',
                    'Methods to assign GPU host work to threads. '
                    'global: all GPUs and CPUs share the same global threads; '
                    'gpu_private: a private threadpool for each GPU; '
                    'gpu_shared: all GPUs share the same threadpool.')
flags.DEFINE_integer('per_gpu_thread_count', 0,
                     'The number of threads to use for GPU. Only valid when '
                     'gpu_thread_mode is not global.')
flags.DEFINE_boolean('hierarchical_copy', False,
                     'Use hierarchical copies. Currently only optimized for '
                     'use on a DGX-1 with 8 GPUs and may perform poorly on '
                     'other hardware. Requires --num_gpus > 1, and only '
                     'recommended when --num_gpus=8')
# TODO(hinsu): Support auto-detection of the network topology while still
# retaining the ability to specify a particular topology for debugging.
flags.DEFINE_enum(
    'network_topology', constants.NetworkTopology.DGX1,
    (constants.NetworkTopology.DGX1, constants.NetworkTopology.GCP_V100),
    'Network topology specifies the topology used to connect multiple devices. '
    'Network topology is used to decide the hierarchy to use for the '
    'hierarchical_copy.')
flags.DEFINE_integer('gradient_repacking', 0, 'Use gradient repacking. It '
                     'currently only works with replicated mode. At the end of '
                     'each step, it repacks the gradients for more efficient '
                     'cross-device transportation. A non-zero value specifies '
                     'the number of split packs that will be formed.',
                     lower_bound=0)
flags.DEFINE_boolean('compact_gradient_transfer', True, 'Compact gradient '
                     'as much as possible for cross-device transfer and '
                     'aggregation.')
flags.DEFINE_enum('variable_consistency', 'strong', ('strong', 'relaxed'),
                  'The data consistency for trainable variables. With strong '
                  'consistency, the variable always have the updates from '
                  'previous step. With relaxed consistency, all the updates '
                  'will eventually show up in the variables. Likely one step '
                  'behind.')
flags.DEFINE_boolean('datasets_repeat_cached_sample', False,
                     'Enable use of a special datasets pipeline that reads a '
                     'single TFRecord into memory and repeats it infinitely '
                     'many times. The purpose of this flag is to make it '
                     'possible to write regression tests that are not '
                     'bottlenecked by CNS throughput. '
                     'Use datasets_use_caching to cache input data.')
flags.DEFINE_enum('local_parameter_device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
                  'Device to use as parameter server: cpu or gpu. For '
                  'distributed training, it can affect where caching of '
                  'variables happens.')
flags.DEFINE_enum('device', 'gpu', ('cpu', 'gpu', 'CPU', 'GPU'),
                  'Device to use for computation: cpu or gpu')
flags.DEFINE_enum('data_format', 'NCHW', ('NHWC', 'NCHW'),
                  'Data layout to use: NHWC (TF native) or NCHW (cuDNN '
                  'native, requires GPU).')
flags.DEFINE_integer('num_intra_threads', None,
                     'Number of threads to use for intra-op parallelism. If '
                     'set to 0, the system will pick an appropriate number. '
                     'None is the same as 0 except that it disables intra-op '
                     'parallelism on a GPU.')
flags.DEFINE_integer('num_inter_threads', 0,
                     'Number of threads to use for inter-op parallelism. If '
                     'set to 0, the system will pick an appropriate number.')
flags.DEFINE_boolean('use_numa_affinity', False,
                     'Whether to turn on NUMA affinity for CPU devices. '
                     'This is probably only useful when --device=cpu.')
flags.DEFINE_string('trace_file', '',
                    'Enable TensorFlow tracing and write trace to this file.')
flags.DEFINE_boolean('use_chrome_trace_format', True,
                     'If True, the trace_file, if specified, will be in a '
                     'Chrome trace format. If False, then it will be a '
                     'StepStats raw proto.')
_NUM_STEPS_TO_PROFILE = 10
_NUM_OPS_TO_PRINT = 20
flags.DEFINE_string('tfprof_file', None,
                    'If specified, write a tfprof ProfileProto to this file. '
                    'The performance and other aspects of the model can then '
                    'be analyzed with tfprof. See '
                    'https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/g3doc/command_line.md '  # pylint: disable=line-too-long
                    'for more info on how to do this. The first %d steps '
                    'are profiled. Additionally, the top %d most time '
                    'consuming ops will be printed.\n'
                    'Note: profiling with tfprof is very slow, but most of the '
                    'overhead is spent between steps. So, profiling results '
                    'are more accurate than the slowdown would suggest.' %
                    (_NUM_STEPS_TO_PROFILE, _NUM_OPS_TO_PRINT))
flags.DEFINE_string('graph_file', None,
                    'Write the model\'s graph definition to this file. '
                    'Defaults to binary format unless filename ends in "txt".')
flags.DEFINE_string('partitioned_graph_file_prefix', None,
                    'If specified, after the graph has been partitioned and '
                    'optimized, write out each partitioned graph to a file '
                    'with the given prefix.')
flags.DEFINE_enum('optimizer', 'sgd', ('momentum', 'sgd', 'rmsprop', 'adam'),
                  'Optimizer to use')
flags.DEFINE_float('init_learning_rate', None,
                   'Initial learning rate for training.')
flags.DEFINE_string('piecewise_learning_rate_schedule', None,
                    'Specifies a piecewise learning rate schedule based on the '
                    'number of epochs. This is the form LR0;E1;LR1;...;En;LRn, '
                    'where each LRi is a learning rate and each Ei is an epoch '
                    'indexed from 0. The learning rate is LRi if the '
                    'E(i-1) <= current_epoch < Ei. For example, if this '
                    'parameter is 0.3;10;0.2;25;0.1, the learning rate is 0.3 '
                    'for the first 10 epochs, then is 0.2 for the next 15 '
                    'epochs, then is 0.1 until training ends.')
flags.DEFINE_float('num_epochs_per_decay', 0,
                   'Steps after which learning rate decays. If 0, the learning '
                   'rate does not decay.')
flags.DEFINE_float('learning_rate_decay_factor', 0,
                   'Learning rate decay factor. Decay by this factor every '
                   '`num_epochs_per_decay` epochs. If 0, learning rate does '
                   'not decay.')
flags.DEFINE_float('num_learning_rate_warmup_epochs', 0,
                   'Slowly increase to the initial learning rate in the first '
                   'num_learning_rate_warmup_epochs linearly.')
flags.DEFINE_float('minimum_learning_rate', 0,
                   'The minimum learning rate. The learning rate will '
                   'never decay past this value. Requires `learning_rate`, '
                   '`num_epochs_per_decay` and `learning_rate_decay_factor` to '
                   'be set.')
flags.DEFINE_float('resnet_base_lr', None,
                   "Base learning rate at bs=256. Only "
                   "relevant when training ResNet and utilizing the model's "
                   "learning rate heuristic (get_learning_rate).")
flags.DEFINE_float('momentum', 0.9, 'Momentum for training.')
flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')
flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum in RMSProp.')
flags.DEFINE_float('rmsprop_epsilon', 1.0, 'Epsilon term for RMSProp.')
flags.DEFINE_float('adam_beta1', 0.9, 'Beta1 term for the Adam optimizer')
flags.DEFINE_float('adam_beta2', 0.999, 'Beta2 term for the Adam optimizer')
flags.DEFINE_float('adam_epsilon', 1e-8, 'Epsilon term for the Adam optimizer')
flags.DEFINE_float('gradient_clip', None,
                   'Gradient clipping magnitude. Disabled by default.')
flags.DEFINE_float('weight_decay', 0.00004,
                   'Weight decay factor for training.')
flags.DEFINE_float('gpu_memory_frac_for_testing', 0,
                   'If non-zero, the fraction of GPU memory that will be used. '
                   'Useful for testing the benchmark script, as this allows '
                   'distributed mode to be run on a single machine. For '
                   'example, if there are two tasks, each can be allocated '
                   '~40 percent of the memory on a single machine. This is '
                   'also useful for using unified memory, as this can be set '
                   'above 1 to oversubscribe the GPU using unified memory.',
                   lower_bound=0.)
flags.DEFINE_boolean('use_unified_memory', None,
                     'If True, allocate unified memory enabling larger models '
                     'to fit in available device RAM.')
flags.DEFINE_boolean('timestamped_allocator', False,
                     'If True marks free BFCAllocator::Chunks with time '
                     'at which they are freed which can allow more efficient '
                     'memory allocation in cases like RDMA networking.')
flags.DEFINE_integer('gpu_kt_max_interval', 0,
                     'If > 0, the maximum number of GPU Ops that may be queued '
                     'in a row without also queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_bytes', 0,
                     'If > 0, the maximum number of bytes '
                     'of GPU memory that may be allocated by sequential '
                     'GPU Ops without queuing a tracking event.')
flags.DEFINE_integer('gpu_kt_max_pending', 0,
                     'If > 0 no more than this many GPU tracking events may be '
                     'outstanding at any time. When this limit is reached '
                     'launch of additional kernels will stall until an '
                     'outstanding event completes.')
flags.DEFINE_boolean('use_tf_layers', True,
                     'If True, use tf.layers for neural network layers. This '
                     'should not affect performance or accuracy in any way.')
flags.DEFINE_integer('tf_random_seed', 1234,
                     'The TensorFlow random seed. Useful for debugging NaNs, '
                     'as this can be set to various values to see if the NaNs '
                     'depend on the seed.')
flags.DEFINE_string('debugger', None,
                    'If set, use the TensorFlow debugger. If set to "cli", use '
                    'the local CLI debugger. Otherwise, this must be in the '
                    'form hostname:port (e.g., localhost:7007) in which case '
                    'the experimental TensorBoard debugger will be used')
flags.DEFINE_boolean('use_python32_barrier', False,
                     'When on, use threading.Barrier at Python 3.2.')
flags.DEFINE_boolean('ml_perf', False,
                     'When True, change how the Imagenet input pipeline works '
                     'slightly to meet the MLPerf compliance rules. This slows '
                     'down the input pipeline. Without this option, at the end '
                     'of the input pipeline, the image is divided by 127.5, '
                     'then 1.0 is subtracted from it, bringing the image '
                     'values from [0, 255] to [-1.0, 1.0]. With this option, '
                     'each of the three channels (red, green, blue) have the '
                     'average channel value among all image subtracted from '
                     'it, and no division is done.')
flags.DEFINE_boolean('datasets_use_prefetch', True,
                     'Enable use of prefetched datasets for input pipeline. '
                     'This option is meaningless if use_datasets=False.')
flags.DEFINE_integer('datasets_prefetch_buffer_size', 1,
                     'Prefetching op buffer size per compute device.')
flags.DEFINE_integer('datasets_num_private_threads', None,
                     'Number of threads for a private threadpool created for '
                     'all datasets computation. By default, we pick an '
                     'appropriate number. If set to 0, we use the default '
                     'tf-Compute threads for dataset operations.')
flags.DEFINE_boolean('datasets_use_caching', False,
                     'Cache the compressed input data in memory. This improves '
                     'the data input performance, at the cost of additional '
                     'memory.')
flags.DEFINE_integer('datasets_parallel_interleave_cycle_length', None,
                     'Number of parallel file readers interleaving input data.')
flags.DEFINE_boolean('datasets_sloppy_parallel_interleave', False,
                     'Allow parallel interleave to depart from deterministic '
                     'ordering, by temporarily skipping over files whose '
                     'elements are not readily available. This can increase '
                     'throughput in particular in the presence of stragglers.')
flags.DEFINE_integer('datasets_parallel_interleave_prefetch', None,
                     'The number of input elements to fetch before they are '
                     'needed for interleaving.')
flags.DEFINE_integer('multi_device_iterator_max_buffer_size', 1,
                     'Configuration parameter for the MultiDeviceIterator that '
                     'specifies the host side buffer size for each device.')
# Performance tuning parameters.
flags.DEFINE_boolean('winograd_nonfused', True,
                     'Enable/disable using the Winograd non-fused algorithms.')
flags.DEFINE_boolean('batchnorm_persistent', True,
                     'Enable/disable using the CUDNN_BATCHNORM_SPATIAL_PERSISTENT '
                     'mode for batchnorm.')
flags.DEFINE_boolean('sync_on_finish', False,
                     'Enable/disable whether the devices are synced after each '
                     'step.')
flags.DEFINE_boolean('staged_vars', False,
                     'whether the variables are staged from the main '
                     'computation')
flags.DEFINE_boolean('force_gpu_compatible', False,
                     'whether to enable force_gpu_compatible in GPU_Options')
flags.DEFINE_boolean('allow_growth', None,
                     'whether to enable allow_growth in GPU_Options')
flags.DEFINE_boolean('xla', False, 'whether to enable XLA auto-jit compilation')
flags.DEFINE_boolean('xla_compile', False,
                     'Enable xla to compile the graph. Uncompilable ops will '
                     'result in fatal errors.')
flags.DEFINE_boolean('fuse_decode_and_crop', True,
                     'Fuse decode_and_crop for image preprocessing.')
flags.DEFINE_boolean('distort_color_in_yiq', True,
                     'Distort color of input images in YIQ space.')
flags.DEFINE_boolean('enable_optimizations', True,
                     'Whether to enable grappler and other optimizations.')
flags.DEFINE_string('rewriter_config', None,
                    'Config for graph optimizers, described as a '
                    'RewriterConfig proto buffer.')
flags.DEFINE_enum('loss_type_to_report', 'total_loss',
                  ('base_loss', 'total_loss'),
                  'Which type of loss to output and to write summaries for. '
                  'The total loss includes L2 loss while the base loss does '
                  'not. Note that the total loss is always used while '
                  'computing gradients during training if weight_decay > 0, '
                  'but explicitly computing the total loss, instead of just '
                  'computing its gradients, can have a performance impact.')
flags.DEFINE_boolean('single_l2_loss_op', False,
                     'If True, instead of using an L2 loss op per variable, '
                     'concatenate the variables into a single tensor and do a '
                     'single L2 loss on the concatenated tensor.')
flags.DEFINE_boolean('use_resource_vars', False,
                     'Use resource variables instead of normal variables. '
                     'Resource variables are slower, but this option is useful '
                     'for debugging their performance.')
flags.DEFINE_boolean('compute_lr_on_cpu', False,
                     'If True, do computations related to learning rate on the '
                     'CPU instead of the GPU. This will significantly improve '
                     'XLA performance in some cases.')
flags.DEFINE_boolean('sparse_to_dense_grads', False,
                     'If True, convert all sparse gradients to dense gradients '
                     'before passing them to the optimizer to update '
                     'variables. Only affects models with sparse gradients, '
                     'which currently is only the NCF model.')
# Performance tuning specific to MKL.
flags.DEFINE_boolean('mkl', False, 'If true, set MKL environment variables.')
flags.DEFINE_integer('kmp_blocktime', 0,
                     'The time, in milliseconds, that a thread should wait, '
                     'after completing the execution of a parallel region, '
                     'before sleeping')
flags.DEFINE_string('kmp_affinity', 'granularity=fine,verbose,compact,1,0',
                    'Restricts execution of certain threads (virtual execution '
                    'units) to a subset of the physical processing units in a '
                    'multiprocessor computer.')
flags.DEFINE_integer('kmp_settings', 1,
                     'If set to 1, MKL settings will be printed.')

# fp16 parameters. If use_fp16=False, no other fp16 parameters apply.
flags.DEFINE_boolean('use_fp16', False,
                     'Use 16-bit floats for certain tensors instead of 32-bit '
                     'floats. This is currently experimental.')
# TODO(reedwm): The default loss scale of 128 causes most models to diverge
# on the second step with synthetic data. Changing the tf.set_random_seed
# call to tf.set_random_seed(1235) or most other seed values causes the
# issue not to occur.
flags.DEFINE_float('fp16_loss_scale', None,
                   'If fp16 is enabled, the loss is multiplied by this amount '
                   'right before gradients are computed, then each gradient '
                   'is divided by this amount. Mathematically, this has no '
                   'effect, but it helps avoid fp16 underflow. Set to 1 to '
                   'effectively disable. Ignored during eval.')
flags.DEFINE_boolean('fp16_vars', False,
                     'If fp16 is enabled, also use fp16 for variables. If '
                     'False, the variables are stored in fp32 and casted to '
                     'fp16 when retrieved. Recommended to leave as False.')
flags.DEFINE_boolean('fp16_enable_auto_loss_scale', False,
                     'If True and use_fp16 is True, automatically adjust the '
                     'loss scale during training.')
flags.DEFINE_integer('fp16_inc_loss_scale_every_n', 1000,
                     'If fp16 is enabled and fp16_enable_auto_loss_scale is '
                     'True, increase the loss scale every n steps.')
# The method for managing variables:
#   parameter_server: variables are stored on a parameter server that holds
#     the master copy of the variable. In local execution, a local device
#     acts as the parameter server for each variable; in distributed
#     execution, the parameter servers are separate processes in the
#     cluster.
#     For each step, each tower gets a copy of the variables from the
#     parameter server, and sends its gradients to the param server.
#   replicated: each GPU has its own copy of the variables. To apply
#     gradients, an all_reduce algorithm or regular cross-device
#     aggregation is used to replicate the combined gradients to all
#     towers (depending on all_reduce_spec parameter setting).
#   independent: each GPU has its own copy of the variables, and gradients
#     are not shared between towers. This can be used to check performance
#     when no data is moved between GPUs.
#   distributed_replicated: Distributed training only. Each GPU has a copy
#     of the variables, and updates its copy after the parameter servers
#     are all updated with the gradients from all servers. Only works with
#     cross_replica_sync=true. Unlike 'replicated', currently never uses
#     nccl all-reduce for replicating within a server.
#   distributed_all_reduce: Distributed training where all replicas run
#     in a single session, using all-reduce to mutually reduce the
#     gradients. Uses no parameter servers. When there is only one
#     worker, this is the same as replicated.
#   collective_all_reduce: Distributed training where all replicas run
#     independently except for variable initialization and for
#     gradient reduction which is done via collective all-reduce.
#     NOTE: collective_all_reduce in conjunction with use_fp16 can
#     lead to NaNs in some models (resnet50). TODO(tucker): fix it.
#   horovod: Distributed training using Horovod library. Runs workers using
#     an MPI framework (e.g. Open MPI). Each worker runs training on
#     single GPU, and averages gradients using NCCL or MPI all-reduce.
#     See https://github.com/uber/horovod for more details.
flags.DEFINE_enum('variable_update', 'parameter_server',
                  ('parameter_server', 'replicated', 'distributed_replicated',
                   'independent', 'distributed_all_reduce',
                   'collective_all_reduce', 'horovod'),
                  'The method for managing variables: parameter_server, '
                  'replicated, distributed_replicated, independent, '
                  'distributed_all_reduce, collective_all_reduce, horovod')
flags.DEFINE_string('all_reduce_spec', None,
                    'A specification of the all_reduce algorithm to be used '
                    'for reducing gradients. For more details, see '
                    'parse_all_reduce_spec in variable_mgr.py. An '
                    'all_reduce_spec has BNF form:\n'
                    'int ::= positive whole number\n'
                    'g_int ::= int[KkMGT]?\n'
                    'alg_spec ::= alg | alg#int\n'
                    'range_spec ::= alg_spec | alg_spec/alg_spec\n'
                    'spec ::= range_spec | range_spec:g_int:range_spec\n'
                    'NOTE: not all syntactically correct constructs are '
                    'supported.\n\n'
                    'Examples:\n'
                    '"xring" == use one global ring reduction for all '
                    'tensors\n'
                    '"pscpu" == use CPU at worker 0 to reduce all tensors\n'
                    '"nccl" == use NCCL to locally reduce all tensors. '
                    'Limited to 1 worker.\n'
                    '"nccl/xring" == locally (to one worker) reduce values '
                    'using NCCL then ring reduce across workers.\n'
                    '"pscpu:32k:xring" == use pscpu algorithm for tensors of '
                    'size up to 32kB, then xring for larger tensors.')
# If variable_update==distributed_all_reduce then it may be advantageous
# to aggregate small tensors into one prior to reduction. These parameters
# control that aggregation.
flags.DEFINE_integer('agg_small_grads_max_bytes', 0,
                     'If > 0, try to aggregate tensors of less than this '
                     'number of bytes prior to all-reduce.')
flags.DEFINE_integer('agg_small_grads_max_group', 10,
                     'When aggregating small tensors for all-reduce do not '
                     'aggregate more than this many into one new tensor.')
flags.DEFINE_integer('allreduce_merge_scope', 1,
                     'Establish a name scope around this many '
                     'gradients prior to creating the all-reduce operations. '
                     'It may affect the ability of the backend to merge '
                     'parallel ops.')
# Distributed training parameters.
flags
.
DEFINE_enum
(
'job_name'
,
''
,
(
'ps'
,
'worker'
,
'controller'
,
''
),
'One of "ps", "worker", "controller", "". Empty for local '
'training'
)
flags
.
DEFINE_string
(
'ps_hosts'
,
''
,
'Comma-separated list of target hosts'
)
flags
.
DEFINE_string
(
'worker_hosts'
,
''
,
'Comma-separated list of target hosts'
)
flags
.
DEFINE_string
(
'controller_host'
,
None
,
'optional controller host'
)
flags
.
DEFINE_integer
(
'task_index'
,
0
,
'Index of task within the job'
)
flags
.
DEFINE_string
(
'server_protocol'
,
'grpc'
,
'protocol for servers'
)
flags
.
DEFINE_boolean
(
'cross_replica_sync'
,
True
,
''
)
flags
.
DEFINE_string
(
'horovod_device'
,
''
,
'Device to do Horovod all-reduce on: '
'empty (default), cpu or gpu. Default with utilize GPU if '
'Horovod was compiled with the HOROVOD_GPU_ALLREDUCE '
'option, and CPU otherwise.'
)

# Summary and Save & load checkpoints.
flags.DEFINE_integer('summary_verbosity', 0,
                     'Verbosity level for summary ops. '
                     'level 0: disable any summary.\n'
                     'level 1: small and fast ops, e.g.: learning_rate, '
                     'total_loss.\n'
                     'level 2: medium-cost ops, e.g. histogram of all '
                     'gradients.\n'
                     'level 3: expensive ops: images and histogram of each '
                     'gradient.\n')
flags.DEFINE_integer('save_summaries_steps', 0,
                     'How often to save summaries for trained models. Pass 0 '
                     'to disable summaries.')
flags.DEFINE_integer('save_model_secs', 0,
                     'How often to save trained models. Pass 0 to disable '
                     'saving checkpoints every N seconds. A checkpoint is '
                     'saved after training completes regardless of this '
                     'option.')
flags.DEFINE_integer('save_model_steps', None,
                     'How often to save trained models. If specified, '
                     'save_model_secs must not be specified.')
flags.DEFINE_integer('max_ckpts_to_keep', 5,
                     'Max number of checkpoints to keep.')
flags.DEFINE_string('train_dir', None,
                    'Path to session checkpoints. Pass None to disable saving '
                    'checkpoint at the end.')
flags.DEFINE_string('eval_dir', '/tmp/tf_cnn_benchmarks/eval',
                    'Directory where to write eval event logs.')
flags.DEFINE_string('backbone_model_path', None,
                    'Path to pretrained backbone model checkpoint. Pass None '
                    'if not using a backbone model.')
flags.DEFINE_enum('trt_mode', '', ['', 'FP32', 'FP16', 'INT8'],
                  'If this is specified in forward_only mode and '
                  'freeze_when_forward_only is set to True, use TensorRT to '
                  'optimize the graph before execution.')
flags.DEFINE_integer('trt_max_workspace_size_bytes', 4 << 30,
                     'Max workspace size bytes used by the TensorRT '
                     'optimizer.')

# Benchmark logging for model garden metric
flags.DEFINE_string('benchmark_log_dir', None,
                    'The directory to place the log files containing the '
                    'results of benchmark. The logs are created by '
                    'BenchmarkFileLogger. Requires the root of the Tensorflow '
                    'models repository to be in $PYTHONPATH.')
flags.DEFINE_string('benchmark_test_id', None,
                    'The unique test ID of the benchmark run. It could be the '
                    'combination of key parameters. It is hardware '
                    'independent and could be used to compare the performance '
                    'between different test runs. This flag is designed for '
                    'human consumption, and does not have any impact within '
                    'the system.')

platforms_util.define_platform_params()
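
# Illustrative only (not part of the original file): a typical multi-GPU run
# exercising the variable_update and all_reduce_spec flags defined above
# might look like
#   python tf_cnn_benchmarks.py --model=resnet50 --num_gpus=4 \
#       --variable_update=replicated --all_reduce_spec=nccl
# The exact set of accepted flags depends on the rest of this module and on
# platforms_util.define_platform_params().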

class GlobalStepWatcher(threading.Thread):
  """A helper class for global_step.

  Polls for changes in the global_step of the model, and finishes when the
  number of steps for the global run are done.
  """

  def __init__(self, sess, global_step_op, start_at_global_step,
               end_at_global_step):
    threading.Thread.__init__(self)
    self.sess = sess
    self.global_step_op = global_step_op
    self.start_at_global_step = start_at_global_step
    self.end_at_global_step = end_at_global_step

    self.start_time = 0
    self.start_step = 0
    self.finish_time = 0
    self.finish_step = 0

  def run(self):
    while self.finish_time == 0:
      time.sleep(.25)
      global_step_val, = self.sess.run([self.global_step_op])
      if self.start_time == 0 and global_step_val >= self.start_at_global_step:
        # Use tf.logging.info instead of log_fn, since print (which is log_fn)
        # is not thread safe and may interleave the outputs from two parallel
        # calls to print, which can break tests.
        tf.logging.info('Starting real work at step %s at time %s' %
                        (global_step_val, time.ctime()))
        self.start_time = time.perf_counter()
        self.start_step = global_step_val
      if self.finish_time == 0 and global_step_val >= self.end_at_global_step:
        tf.logging.info('Finishing real work at step %s at time %s' %
                        (global_step_val, time.ctime()))
        self.finish_time = time.perf_counter()
        self.finish_step = global_step_val

  def done(self):
    return self.finish_time > 0

  def num_steps(self):
    return self.finish_step - self.start_step

  def elapsed_time(self):
    return self.finish_time - self.start_time
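
# Usage sketch (illustrative, not part of the original file): the watcher is
# started alongside training and polled until it reports completion, e.g.
#   watcher = GlobalStepWatcher(sess, global_step_op, num_warmup_batches,
#                               num_warmup_batches + num_batches)
#   watcher.start()
#   ...
#   if watcher.done():
#     images_per_sec = (watcher.num_steps() * batch_size /
#                       watcher.elapsed_time())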

class CheckpointNotFoundException(Exception):
  pass


def create_config_proto(params):
  """Returns session config proto.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
  """
  config = tf.ConfigProto()
  config.allow_soft_placement = True
  if params.num_intra_threads is None:
    if params.device == 'gpu':
      config.intra_op_parallelism_threads = 1
  else:
    config.intra_op_parallelism_threads = params.num_intra_threads
  config.inter_op_parallelism_threads = params.num_inter_threads
  config.experimental.collective_group_leader = '/job:worker/replica:0/task:0'
  config.gpu_options.experimental.collective_ring_order = params.gpu_indices
  config.gpu_options.force_gpu_compatible = params.force_gpu_compatible
  config.experimental.use_numa_affinity = params.use_numa_affinity
  if params.device == 'cpu':
    # TODO(tucker): change num_gpus to num_devices
    config.device_count['CPU'] = params.num_gpus
  if params.allow_growth is not None:
    config.gpu_options.allow_growth = params.allow_growth
  if params.gpu_memory_frac_for_testing > 0:
    config.gpu_options.per_process_gpu_memory_fraction = (
        params.gpu_memory_frac_for_testing)
  if params.use_unified_memory:
    config.gpu_options.experimental.use_unified_memory = (
        params.use_unified_memory)
  if params.timestamped_allocator:
    config.gpu_options.experimental.timestamped_allocator = (
        params.timestamped_allocator)
  if params.gpu_kt_max_interval > 0:
    config.gpu_options.experimental.kernel_tracker_max_interval = (
        params.gpu_kt_max_interval)
  if params.gpu_kt_max_bytes > 0:
    config.gpu_options.experimental.kernel_tracker_max_bytes = (
        params.gpu_kt_max_bytes)
  if params.gpu_kt_max_pending > 0:
    config.gpu_options.experimental.kernel_tracker_max_pending = (
        params.gpu_kt_max_pending)
  if params.xla:
    config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  if params.rewriter_config:
    rewriter_config = rewriter_config_pb2.RewriterConfig()
    text_format.Merge(params.rewriter_config, rewriter_config)
    config.graph_options.rewrite_options.CopyFrom(rewriter_config)
  elif not params.enable_optimizations:
    config.graph_options.optimizer_options.opt_level = tf.OptimizerOptions.L0
    config.graph_options.rewrite_options.disable_meta_optimizer = True
  elif params.variable_update == 'collective_all_reduce':
    rewrite_options = config.graph_options.rewrite_options
    rewrite_options.scoped_allocator_optimization = (
        rewriter_config_pb2.RewriterConfig.ON)
    rewrite_options.scoped_allocator_opts.enable_op.append('CollectiveReduce')
  if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    config.gpu_options.visible_device_list = str(hvd.local_rank())
  # For collective_all_reduce, ignore all devices except current worker.
  if params.variable_update == 'collective_all_reduce':
    del config.device_filters[:]
    config.device_filters.append(
        '/job:%s/replica:0/task:%d' % (params.job_name, params.task_index))

  # TODO(b/117324590): Re-enable PinToHostOptimizer when b/117324590 is fixed.
  # Currently we have to disable PinToHostOptimizer w/ XLA since it causes
  # OOM/perf cliffs.
  config.graph_options.rewrite_options.pin_to_host_optimization = (
      rewriter_config_pb2.RewriterConfig.OFF)
  return config
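
# Note: the returned proto is typically passed straight to the session, e.g.
# tf.Session(config=create_config_proto(params)), as is done elsewhere in
# this file (see _run_eval below).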

def get_mode_from_params(params):
  """Returns the mode in which this script is running.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
  Raises:
    ValueError: Unsupported params settings.
  """
  if params.forward_only and params.eval:
    raise ValueError('Only one of forward_only and eval parameters is true')

  if params.eval:
    return constants.BenchmarkMode.EVAL
  elif params.forward_only:
    return constants.BenchmarkMode.FORWARD_ONLY
  elif (params.eval_during_training_every_n_steps or
        params.eval_during_training_every_n_epochs or
        params.eval_during_training_at_specified_steps or
        params.eval_during_training_at_specified_epochs):
    return constants.BenchmarkMode.TRAIN_AND_EVAL
  else:
    return constants.BenchmarkMode.TRAIN


# How many digits to show for the loss and accuracies during training.
LOSS_AND_ACCURACY_DIGITS_TO_SHOW = 3

def benchmark_one_step(sess,
                       fetches,
                       step,
                       batch_size,
                       step_train_times,
                       trace_filename,
                       partitioned_graph_file_prefix,
                       profiler,
                       image_producer,
                       params,
                       summary_op=None,
                       show_images_per_sec=True,
                       benchmark_logger=None,
                       collective_graph_key=0,
                       should_output_files=True):
  """Advance one step of benchmarking."""
  should_profile = profiler and 0 <= step < _NUM_STEPS_TO_PROFILE
  need_options_and_metadata = (
      should_profile or collective_graph_key > 0 or
      ((trace_filename or partitioned_graph_file_prefix) and step == -2))
  if need_options_and_metadata:
    run_options = tf.RunOptions()
    if (trace_filename and step == -2) or should_profile:
      run_options.trace_level = tf.RunOptions.FULL_TRACE
    if partitioned_graph_file_prefix and step == -2:
      run_options.output_partition_graphs = True
    if collective_graph_key > 0:
      run_options.experimental.collective_graph_key = collective_graph_key
    run_metadata = tf.RunMetadata()
  else:
    run_options = None
    run_metadata = None
  summary_str = None
  start_time = time.perf_counter()
  if summary_op is None:
    results = sess.run(fetches, options=run_options,
                       run_metadata=run_metadata)
  else:
    (results, summary_str) = sess.run([fetches, summary_op],
                                      options=run_options,
                                      run_metadata=run_metadata)
  if not params.forward_only:
    lossval = results['average_loss']
  else:
    lossval = 0.
  if image_producer is not None:
    image_producer.notify_image_consumption()
  train_time = time.perf_counter() - start_time
  step_train_times.append(train_time)
  if (show_images_per_sec and step >= 0 and
      (step == 0 or (step + 1) % params.display_every == 0)):
    speed_mean, speed_uncertainty, speed_jitter = get_perf_timing(
        batch_size, step_train_times, params.display_perf_ewma)
    log_str = '%i\t%s\t%.*f' % (
        step + 1,
        get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter),
        LOSS_AND_ACCURACY_DIGITS_TO_SHOW, lossval)
    if 'top_1_accuracy' in results:
      log_str += '\t%.*f\t%.*f' % (
          LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_1_accuracy'],
          LOSS_AND_ACCURACY_DIGITS_TO_SHOW, results['top_5_accuracy'])
    log_fn(log_str)
    if benchmark_logger:
      benchmark_logger.log_metric(
          'current_examples_per_sec', speed_mean, global_step=step + 1)
      if 'top_1_accuracy' in results:
        benchmark_logger.log_metric(
            'top_1_accuracy', results['top_1_accuracy'], global_step=step + 1)
        benchmark_logger.log_metric(
            'top_5_accuracy', results['top_5_accuracy'], global_step=step + 1)
  if need_options_and_metadata:
    if should_profile:
      profiler.add_step(step, run_metadata)
    if trace_filename and step == -2 and should_output_files:
      log_fn('Dumping trace to %s' % trace_filename)
      trace_dir = os.path.dirname(trace_filename)
      if not gfile.Exists(trace_dir):
        gfile.MakeDirs(trace_dir)
      with gfile.Open(trace_filename, 'w') as trace_file:
        if params.use_chrome_trace_format:
          trace = timeline.Timeline(step_stats=run_metadata.step_stats)
          trace_file.write(
              trace.generate_chrome_trace_format(show_memory=True))
        else:
          trace_file.write(str(run_metadata.step_stats))
    if partitioned_graph_file_prefix and step == -2 and should_output_files:
      path, filename = os.path.split(partitioned_graph_file_prefix)
      if '.' in filename:
        base_filename, ext = filename.rsplit('.', 1)
        ext = '.' + ext
      else:
        base_filename, ext = filename, ''
      as_text = filename.endswith('txt')
      for graph_def in run_metadata.partition_graphs:
        device = graph_def.node[0].device.replace('/', '_').replace(':', '_')
        graph_filename = '%s%s%s' % (base_filename, device, ext)
        log_fn('Writing partitioned GraphDef as %s to %s' % (
            'text' if as_text else 'binary',
            os.path.join(path, graph_filename)))
        tf.train.write_graph(graph_def, path, graph_filename, as_text)
  return (summary_str, lossval)
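
# When --trace_file is set together with --use_chrome_trace_format, the trace
# written above is in the Chrome trace-event format and can be inspected by
# loading it into chrome://tracing (or a compatible trace viewer).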

def get_perf_timing_str(speed_mean, speed_uncertainty, speed_jitter, scale=1):
  if scale == 1:
    # TODO(laigd): rename 'images' to maybe 'inputs', same below.
    return ('images/sec: %.1f +/- %.1f (jitter = %.1f)' %
            (speed_mean, speed_uncertainty, speed_jitter))
  else:
    return 'images/sec: %.1f' % speed_mean


def get_perf_timing(batch_size, step_train_times, ewma_alpha=None, scale=1):
  """Calculate benchmark processing speed."""
  times = np.array(step_train_times)
  speeds = batch_size / times
  if ewma_alpha:
    weights = np.logspace(len(times) - 1, 0, len(times), base=1 - ewma_alpha)
    time_mean = np.average(times, weights=weights)
  else:
    time_mean = np.mean(times)
  speed_mean = scale * batch_size / time_mean
  speed_uncertainty = np.std(speeds) / np.sqrt(float(len(speeds)))
  speed_jitter = 1.4826 * np.median(np.abs(speeds - np.median(speeds)))
  return speed_mean, speed_uncertainty, speed_jitter
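
# Worked example (illustrative): with batch_size=256 and step_train_times of
# [0.50, 0.52, 0.48] seconds, the per-step speeds are roughly
# [512.0, 492.3, 533.3] images/sec. Without EWMA, speed_mean is
# 256 / mean(times) = 512.0, speed_uncertainty is std(speeds) / sqrt(3),
# about 9.7, and speed_jitter is 1.4826 * median(|speeds - median(speeds)|),
# about 29.2.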

def load_checkpoint(saver, sess, ckpt_dir):
  """Loads checkpoint from provided directory or full path.

  Args:
    saver: Saver used to restore the checkpoint.
    sess: TensorFlow session.
    ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.

  Returns:
    Global step.
  """
  model_checkpoint_path = _get_checkpoint_to_load(ckpt_dir)
  global_step = model_checkpoint_path.split('/')[-1].split('-')[-1]
  if not global_step.isdigit():
    global_step = 0
  else:
    global_step = int(global_step)
  saver.restore(sess, model_checkpoint_path)
  log_fn('Successfully loaded model from %s.' % model_checkpoint_path)
  return global_step


def _get_checkpoint_to_load(ckpt_dir):
  """Returns which checkpoint to load.

  Args:
    ckpt_dir: Path to a folder of checkpoints or full path to a checkpoint.

  Returns:
    Full path to checkpoint to load.

  Raises:
    CheckpointNotFoundException: If checkpoint is not found.
  """
  p = re.compile(r'ckpt-\d+$')
  if p.search(ckpt_dir):
    model_checkpoint_path = ckpt_dir
  else:
    # Finds latest checkpoint in directory provided
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and ckpt.model_checkpoint_path:
      model_checkpoint_path = ckpt.model_checkpoint_path
    else:
      raise CheckpointNotFoundException(
          'No checkpoint file found in dir:{}'.format(ckpt_dir))
  return model_checkpoint_path
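
# _get_checkpoint_to_load accepts either a checkpoint directory or a full
# checkpoint path whose basename ends in 'ckpt-<N>' (for example, a
# hypothetical '/tmp/train/model.ckpt-5000'); in the latter case the trailing
# number is what load_checkpoint reports as the global step.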

# Params are passed to BenchmarkCNN's constructor. Params is a map from name
# to value, with one field per key in flags.param_specs.
#
# Call make_params() or make_params_from_flags() below to construct a Params
# tuple with default values from flags.param_specs, rather than constructing
# Params directly.
Params = namedtuple('Params', flags.param_specs.keys())  # pylint: disable=invalid-name


def validate_params(params):
  """Validates that the Params tuple had valid values.

  When command-line flags are defined for each ParamSpec by calling
  flags.define_flags(), calling this function is unnecessary because absl
  already does flag validation. Otherwise, this function should be called.

  Args:
    params: A Params tuple.
  Raises:
    ValueError: An element of params had an invalid value.
  """
  for name, value in params._asdict().items():
    param_spec = flags.param_specs[name]
    if param_spec.flag_type in ('integer', 'float'):
      if (value is not None and param_spec.kwargs['lower_bound'] is not None
          and value < param_spec.kwargs['lower_bound']):
        raise ValueError('Param %s value of %s is lower than the lower bound '
                         'of %s' %
                         (name, value, param_spec.kwargs['lower_bound']))
      if (value is not None and param_spec.kwargs['upper_bound'] is not None
          and param_spec.kwargs['upper_bound'] < value):
        raise ValueError('Param %s value of %s is higher than the upper bound '
                         'of %s' %
                         (name, value, param_spec.kwargs['upper_bound']))
    elif (value is not None and param_spec.flag_type == 'enum' and
          value not in param_spec.kwargs['enum_values']):
      raise ValueError('Param %s of value %s is not in %s' %
                       (name, value, param_spec.kwargs['enum_values']))


def make_params(**kwargs):
  """Create a Params tuple for BenchmarkCNN from kwargs.

  Default values are filled in from flags.param_specs.

  Args:
    **kwargs: kwarg values will override the default values.
  Returns:
    Params namedtuple for constructing BenchmarkCNN.
  """
  # Create a (name: default_value) map from flags.param_specs.
  default_kwargs = {
      name: flags.param_specs[name].default_value
      for name in flags.param_specs
  }
  params = Params(**default_kwargs)._replace(**kwargs)
  validate_params(params)
  return params


def make_params_from_flags():
  """Create a Params tuple for BenchmarkCNN from absl_flags.FLAGS.

  Returns:
    Params namedtuple for constructing BenchmarkCNN.
  """
  # Collect (name: value) pairs for absl_flags.FLAGS with matching names in
  # flags.param_specs.
  flag_values = {name: getattr(absl_flags.FLAGS, name)
                 for name in flags.param_specs.keys()}
  return Params(**flag_values)


def remove_param_fields(params, fields_to_remove):
  """Remove fields from a Params namedtuple."""
  params_dict = params._asdict()
  for field in fields_to_remove:
    assert field in params_dict, 'Invalid Params field: ' + field
  params_dict = {k: v for k, v in params_dict.items()
                 if k not in fields_to_remove}
  new_params_type = namedtuple('Params', params_dict.keys())
  return new_params_type(**params_dict)
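
# Illustrative use, assuming flags.param_specs has been populated by the flag
# definitions in this module:
#   params = make_params(num_gpus=2, batch_size=64, model='resnet50')
# Individual fields can then be overridden with the usual namedtuple
# mechanism, e.g.
#   params = params._replace(num_batches=200)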

def get_num_batches_and_epochs(params, batch_size, num_examples_per_epoch):
  """Returns the number of batches and epochs to run for.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
    batch_size: The number of images per step.
    num_examples_per_epoch: The number of images in a single epoch.

  Returns:
    num_batches: The number of batches to run for.
    num_epochs: The number of epochs to run for. This might be slightly
      larger than params.num_epochs if specified, because the number of
      batches must be an integer.

  Raises:
    ValueError: Invalid or unsupported params.
  """
  if params.num_batches and params.num_epochs:
    raise ValueError('At most one of --num_batches and --num_epochs may be '
                     'specified.')
  if params.num_epochs:
    num_batches = int(params.num_epochs * num_examples_per_epoch +
                      batch_size - 1) // batch_size
  else:
    num_batches = params.num_batches or _DEFAULT_NUM_BATCHES
  num_epochs = num_batches * batch_size / num_examples_per_epoch
  return (num_batches, num_epochs)
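
# Worked example: with batch_size=1024, num_examples_per_epoch=1281167
# (ImageNet train) and --num_epochs=1, num_batches is
# ceil(1281167 / 1024) = 1252, and the returned num_epochs is
# 1252 * 1024 / 1281167, approximately 1.0007.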

def get_piecewise_learning_rate(piecewise_learning_rate_schedule,
                                global_step, num_batches_per_epoch):
  """Returns a piecewise learning rate tensor.

  Args:
    piecewise_learning_rate_schedule: The --piecewise_learning_rate_schedule
      parameter
    global_step: Scalar tensor representing the global step.
    num_batches_per_epoch: float indicating the number of batches per epoch.

  Returns:
    A scalar float tensor, representing the learning rate.

  Raises:
    ValueError: piecewise_learning_rate_schedule is not formatted correctly.
  """
  pieces = piecewise_learning_rate_schedule.split(';')
  if len(pieces) % 2 == 0:
    raise ValueError('--piecewise_learning_rate_schedule must have an odd '
                     'number of components')
  values = []
  boundaries = []
  for i, piece in enumerate(pieces):
    if i % 2 == 0:
      try:
        values.append(float(piece))
      except ValueError:
        raise ValueError('Invalid learning rate: ' + piece)
    else:
      try:
        boundaries.append(int(int(piece) * num_batches_per_epoch) - 1)
      except ValueError:
        raise ValueError('Invalid epoch: ' + piece)
  return tf.train.piecewise_constant(global_step, boundaries, values,
                                     name='piecewise_learning_rate')


def get_learning_rate(params, global_step, num_examples_per_epoch, model,
                      batch_size):
  """Returns a learning rate tensor based on global_step.

  Args:
    params: Params tuple, typically created by make_params or
      make_params_from_flags.
    global_step: Scalar tensor representing the global step.
    num_examples_per_epoch: The number of examples per epoch.
    model: The model.Model object to obtain the default learning rate from if
      no learning rate is specified.
    batch_size: Number of examples per step

  Returns:
    A scalar float tensor, representing the learning rate. When evaluated, the
    learning rate depends on the current value of global_step.

  Raises:
    ValueError: Invalid or unsupported params.
  """
  with tf.name_scope('learning_rate'):
    num_batches_per_epoch = num_examples_per_epoch / batch_size

    if params.piecewise_learning_rate_schedule:
      if (params.init_learning_rate is not None or
          params.learning_rate_decay_factor or
          params.minimum_learning_rate or params.num_epochs_per_decay):
        raise ValueError('No other learning rate-related flags can be '
                         'specified if --piecewise_learning_rate_schedule is '
                         'specified')
      learning_rate = get_piecewise_learning_rate(
          params.piecewise_learning_rate_schedule,
          global_step, num_batches_per_epoch)
    elif params.init_learning_rate is not None:
      learning_rate = params.init_learning_rate
      if (params.num_epochs_per_decay > 0 and
          params.learning_rate_decay_factor > 0):
        decay_steps = int(num_batches_per_epoch * params.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        learning_rate = tf.train.exponential_decay(
            params.init_learning_rate,
            global_step,
            decay_steps,
            params.learning_rate_decay_factor,
            staircase=True)

        if params.minimum_learning_rate != 0.:
          learning_rate = tf.maximum(learning_rate,
                                     params.minimum_learning_rate)
    else:
      learning_rate = model.get_learning_rate(global_step, batch_size)
    if params.num_learning_rate_warmup_epochs > 0 and (
        params.init_learning_rate is not None or
        params.piecewise_learning_rate_schedule):
      warmup_steps = int(num_batches_per_epoch *
                         params.num_learning_rate_warmup_epochs)
      init_lr = params.init_learning_rate
      if init_lr is None:
        init_lr = float(params.piecewise_learning_rate_schedule.split(';')[0])
      warmup_lr = init_lr * tf.cast(global_step, tf.float32) / tf.cast(
          warmup_steps, tf.float32)
      learning_rate = tf.cond(global_step < warmup_steps,
                              lambda: warmup_lr, lambda: learning_rate)

    learning_rate = mlperf.logger.log_deferred_tensor_value(
        mlperf.tags.OPT_LR, learning_rate, global_step, every_n=100)
  return learning_rate


def get_optimizer(params, learning_rate):
  """Returns the optimizer that should be used based on params."""
  if params.optimizer == 'momentum':
    mlperf.logger.log(key=mlperf.tags.OPT_NAME,
                      value=mlperf.tags.SGD_WITH_MOMENTUM)
    mlperf.logger.log(key=mlperf.tags.OPT_MOMENTUM, value=params.momentum)
    opt = tf.train.MomentumOptimizer(
        learning_rate, params.momentum, use_nesterov=True)
  elif params.optimizer == 'sgd':
    mlperf.logger.log(key=mlperf.tags.OPT_NAME, value=mlperf.tags.SGD)
    opt = tf.train.GradientDescentOptimizer(learning_rate)
  elif params.optimizer == 'rmsprop':
    opt = tf.train.RMSPropOptimizer(
        learning_rate,
        params.rmsprop_decay,
        momentum=params.rmsprop_momentum,
        epsilon=params.rmsprop_epsilon)
  elif params.optimizer == 'adam':
    opt = tf.train.AdamOptimizer(learning_rate, params.adam_beta1,
                                 params.adam_beta2, params.adam_epsilon)
  else:
    raise ValueError('Optimizer "{}" was not recognized'.
                     format(params.optimizer))
  return opt
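
# Schedule format note for get_piecewise_learning_rate above: the string
# alternates learning-rate values and epoch boundaries, separated by ';'.
# For example (illustrative), '0.4;10;0.04;20;0.004' uses 0.4 until epoch 10,
# 0.04 until epoch 20 and 0.004 afterwards; epoch boundaries are converted to
# step boundaries using num_batches_per_epoch.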

def generate_tfprof_profile(profiler, tfprof_file):
  """Generates a tfprof profile, writing it to a file and printing top ops.

  Args:
    profiler: A tf.profiler.Profiler. `profiler.add_step` must have already
      been called.
    tfprof_file: The filename to write the ProfileProto to.
  """
  profile_proto = profiler.serialize_to_string()
  log_fn('Dumping ProfileProto to %s' % tfprof_file)
  with gfile.Open(tfprof_file, 'wb') as f:
    f.write(profile_proto)

  # Print out the execution times of the top operations. Note this
  # information can also be obtained with the dumped ProfileProto, but
  # printing it means tfprof doesn't have to be used if all the user wants
  # is the top ops.
  options = tf.profiler.ProfileOptionBuilder.time_and_memory()
  options['max_depth'] = _NUM_OPS_TO_PRINT
  options['order_by'] = 'accelerator_micros'
  profiler.profile_operations(options)

class BenchmarkCNN(object):
  """Class for benchmarking a cnn network."""

  def __init__(self, params, dataset=None, model=None):
    """Initialize BenchmarkCNN.

    Args:
      params: Params tuple, typically created by make_params or
        make_params_from_flags.
      dataset: If not None, the dataset to use. Otherwise, params is used to
        obtain the dataset.
      model: If not None, the model to use. Otherwise, params is used to
        obtain the model.

    Raises:
      ValueError: Unsupported params settings.
    """
    mlperf.logger.log(key=mlperf.tags.RUN_START)
    self.params = params
    if params.eval:
      self._doing_eval = True
    else:
      # Note self._doing_eval can later switch to True in self._do_eval() if
      # self.params.eval_during_training_* is specified.
      self._doing_eval = False
    self.dataset = dataset or datasets.create_dataset(self.params.data_dir,
                                                      self.params.data_name)
    self.model = model or model_config.get_model_config(
        self.params.model, self.dataset, self.params)
    self.trace_filename = self.params.trace_file
    self.rewriter_config = self.params.rewriter_config
    autotune_threshold = self.params.autotune_threshold if (
        self.params.autotune_threshold) else 1
    min_autotune_warmup = 5 * autotune_threshold * autotune_threshold
    self.num_warmup_batches = self.params.num_warmup_batches if (
        self.params.num_warmup_batches is not None) else max(
            10, min_autotune_warmup)
    self.graph_file = self.params.graph_file
    self.resize_method = self.params.resize_method
    self.sync_queue_counter = 0
    self.num_gpus = self.params.num_gpus
    if self.params.gpu_indices:
      self.gpu_indices = [int(x) for x in self.params.gpu_indices.split(',')]
    else:
      self.gpu_indices = [x for x in range(self.num_gpus)]

    if (self.params.device == 'cpu' and self.params.data_format == 'NCHW' and
        not self.params.mkl):
      raise ValueError('device=cpu requires that data_format=NHWC')

    if ((self.params.num_epochs_per_decay or
         self.params.learning_rate_decay_factor) and
        not (self.params.init_learning_rate is not None and
             self.params.num_epochs_per_decay and
             self.params.learning_rate_decay_factor)):
      raise ValueError('If one of num_epochs_per_decay or '
                       'learning_rate_decay_factor is set, both must be set '
                       'and learning_rate must be set')
    if (self.params.minimum_learning_rate and
        not (self.params.init_learning_rate is not None and
             self.params.num_epochs_per_decay and
             self.params.learning_rate_decay_factor)):
      raise ValueError('minimum_learning_rate requires learning_rate, '
                       'num_epochs_per_decay, and '
                       'learning_rate_decay_factor to be set')

    if (self.params.use_fp16 and self.params.fp16_vars and
        'replicated' in self.params.variable_update and
        self.params.all_reduce_spec and
        'nccl' in self.params.all_reduce_spec):
      raise ValueError('fp16 variables are not supported with NCCL')
    if (self.params.use_fp16 and self.params.fp16_vars and
        self.params.gradient_repacking):
      raise ValueError('--fp16_vars cannot be used with --gradient_repacking')

    if self.params.variable_update == 'horovod' and self.params.num_gpus > 1:
      raise ValueError('Horovod benchmarks require num_gpus=1 on each worker')

    if self.params.variable_update == 'horovod' and self.params.job_name:
      raise ValueError('job_name should not be specified for Horovod.')

    if self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale:
      if self.params.all_reduce_spec and 'nccl' in self.params.all_reduce_spec:
        raise ValueError('Automatic loss scaling is not supported with NCCL.')
      if self.params.variable_update not in ('parameter_server', 'replicated',
                                             'independent'):
        raise ValueError('Automatic loss scaling is not supported with '
                         'variable_update=%s.' % self.params.variable_update)
      if self.params.staged_vars:
        raise ValueError('Automatic loss scaling is not supported with '
                         'staged_vars.')

    if (self.params.debugger is not None and self.params.debugger != 'cli' and
        ':' not in self.params.debugger):
      raise ValueError('--debugger must be "cli" or in the form '
                       'host:port')

    if self.params.hierarchical_copy and self.params.num_gpus <= 1:
      raise ValueError('--hierarchical_copy requires --num_gpus to be greater '
                       'than 1')

    if params.save_model_secs and params.save_model_steps:
      raise ValueError('At most one of --save_model_secs and '
                       '--save_model_steps can be specified')

    eval_during_training_flags = list(map(bool, [
        params.eval_during_training_every_n_steps,
        params.eval_during_training_every_n_epochs,
        params.eval_during_training_at_specified_steps,
        params.eval_during_training_at_specified_epochs,
    ]))

    if eval_during_training_flags.count(True) > 1:
      raise ValueError('At most one flag with --eval_during_training_* prefix '
                       'must be specified.')

    eval_during_training_enabled = any(eval_during_training_flags)

    if eval_during_training_enabled:
      if params.eval:
        raise ValueError('At most one of --eval and --eval_during_training_* '
                         'must be specified')
      if params.forward_only:
        raise ValueError('At most one of --forward_only and '
                         '--eval_during_training_* must be specified')
      if params.job_name:
        raise ValueError('--eval_during_training_* is not yet supported in '
                         'distributed mode.')
      if params.staged_vars:
        raise ValueError('--eval_during_training_* is not currently '
                         'compatible with --staged_vars')

    if params.stop_at_top_1_accuracy and not eval_during_training_enabled:
      raise ValueError('--stop_at_top_1_accuracy is only supported with '
                       '--eval_during_training_*')
    if params.collect_eval_results_async and params.model != 'ssd300':
      raise ValueError('--collect_eval_results_async only works with ssd300 '
                       'model currently.')

    if self.params.forward_only and self.params.freeze_when_forward_only:
      if self.params.train_dir is not None:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True, --train_dir '
                         'should not be specified')
      if self.params.data_dir and not self.params.datasets_use_prefetch:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True and --data_dir '
                         'is set, --datasets_use_prefetch should be set to '
                         'True')
      if self.params.job_name:
        raise ValueError('In forward_only mode, when '
                         '--freeze_when_forward_only is True, --job_name '
                         'should not be specified and distributed running is '
                         'not supported')
      self.forward_only_and_freeze = True
    else:
      self.forward_only_and_freeze = False
      if self.params.trt_mode:
        raise ValueError('--trt_mode should not be specified if one of '
                         '--forward_only and --freeze_when_forward_only is '
                         'set to False')

    self.mode = get_mode_from_params(self.params)

    # Use the batch size from the command line if specified, otherwise use the
    # model's default batch size. Scale the benchmark's batch size by the
    # number of GPUs.
    if self.params.batch_size > 0:
      self.model.set_batch_size(self.params.batch_size)
    self.batch_size = self.model.get_batch_size() * self.num_gpus
    if self.mode in (constants.BenchmarkMode.TRAIN,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      self.train_batch_size = self.batch_size
    else:
      self.train_batch_size = None
    if self.mode in (constants.BenchmarkMode.EVAL,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      if self.params.eval_batch_size > 0:
        self.eval_batch_size = self.params.eval_batch_size * self.num_gpus
      else:
        self.eval_batch_size = self.batch_size
    else:
      self.eval_batch_size = None
    self.batch_group_size = self.params.batch_group_size
    self.enable_auto_loss_scale = (
        self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale)
    self.loss_scale = None
    self.loss_scale_normal_steps = None

    self.job_name = self.params.job_name  # "" for local training

    # PS server is used for distributed jobs not using all-reduce.
    use_ps_server = self.job_name and (
        self.params.variable_update != 'distributed_all_reduce' and
        self.params.variable_update != 'collective_all_reduce')
    # controller is used for distributed_all_reduce with > 1 worker.
    use_controller = (
        self.params.variable_update == 'distributed_all_reduce' and
        self.job_name)
    if use_controller and not params.controller_host:
      raise ValueError('When variable_update==distributed_all_reduce '
                       'controller_host must also be specified.')
    self.single_session = (
        self.params.variable_update == 'distributed_all_reduce')
    # collective_all_reduce doesn't need a controller or ps
    self.distributed_collective = (
        self.params.variable_update == 'collective_all_reduce' and
        self.job_name)

    self.local_parameter_device_flag = self.params.local_parameter_device
    if self.job_name:
      self.task_index = self.params.task_index
      self.cluster_manager = platforms_util.get_cluster_manager(
          params, create_config_proto(params))
      assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager)

      worker_prefix = '/job:worker/replica:0/task:%s' % self.task_index
      if use_ps_server:
        self.param_server_device = tf.train.replica_device_setter(
            worker_device=worker_prefix + '/cpu:0',
            cluster=self.cluster_manager.get_cluster_spec())
        # The device on which the queues for managing synchronization between
        # servers should be stored.
        self.sync_queue_devices = [
            '/job:ps/replica:0/task:%s/cpu:0' % i
            for i in range(self.cluster_manager.num_ps())
        ]
      else:
        self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0']
    else:
      self.task_index = 0
      self.cluster_manager = None
      worker_prefix = ''
      self.param_server_device = '/%s:0' % self.params.local_parameter_device
      self.sync_queue_devices = [self.param_server_device]

    if self.cluster_manager:
      self.num_workers = self.cluster_manager.num_workers()
    elif self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      self.num_workers = hvd.size()
    else:
      self.num_workers = 1
    self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0

    if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl':
      raise ValueError('--all_reduce_spec=nccl is invalid in a '
                       'multi-worker job')

    # Device to use for ops that need to always run on the local worker's CPU.
    self.cpu_device = '%s/cpu:0' % worker_prefix

    # Device to use for ops that need to always run on the local worker's
    # compute device, and never on a parameter server device.
    self.raw_devices = [
        '%s/%s:%i' % (worker_prefix, self.params.device, i)
        for i in xrange(self.num_gpus)
    ]

    subset = 'validation' if params.eval else 'train'
    self.num_batches, self.num_epochs = get_num_batches_and_epochs(
        params, self.batch_size * self.num_workers,
        self.dataset.num_examples_per_epoch(subset))

    if self.mode in (constants.BenchmarkMode.EVAL,
                     constants.BenchmarkMode.TRAIN_AND_EVAL):
      # TODO(reedwm): Currently we do extra eval logic for num_eval_batches
      # and the preprocessor. We should encapsulate this logic into a shared
      # function or class.
      if params.num_eval_batches is None and params.num_eval_epochs is None:
        eval_params = self.params
      else:
        eval_params = self.params._replace(
            num_batches=self.params.num_eval_batches,
            num_epochs=self.params.num_eval_epochs)
      self.num_eval_batches, self.num_eval_epochs = get_num_batches_and_epochs(
          eval_params, self.eval_batch_size * self.num_workers,
          self.dataset.num_examples_per_epoch('validation'))
    else:
      self.num_eval_batches, self.num_eval_epochs = None, None

    num_train_examples_per_epoch = self.dataset.num_examples_per_epoch('train')
    if self.params.eval_during_training_every_n_epochs:
      n_epochs = self.params.eval_during_training_every_n_epochs
      self.eval_during_training_at_specified_steps = {
          (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
           self.batch_size)
          for e in np.arange(n_epochs, self.num_epochs, n_epochs)}
    if self.params.eval_during_training_at_specified_steps:
      try:
        self.eval_during_training_at_specified_steps = set(
            map(int, self.params.eval_during_training_at_specified_steps))
      except ValueError:
        raise ValueError('Param eval_during_training_at_specified_steps value '
                         'of %s cannot be converted to a list of integers.' %
                         (self.params.eval_during_training_at_specified_steps))
    if self.params.eval_during_training_at_specified_epochs:
      try:
        n_epochs = list(
            map(float, self.params.eval_during_training_at_specified_epochs))
        offset = n_epochs[0] - 1
        if offset.is_integer():
          offset = int(offset)
        mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)
        self.eval_during_training_at_specified_steps = {
            (int(e * num_train_examples_per_epoch + self.batch_size - 1) //
             self.batch_size)
            for e in n_epochs}
      except ValueError:
        raise ValueError('Param eval_during_training_at_specified_epochs '
                         'value of %s cannot be converted to a list of '
                         'floats.' %
                         (self.params.eval_during_training_at_specified_epochs))
    if params.eval_during_training_every_n_epochs:
      offset = params.eval_during_training_every_n_epochs - 1
      if offset.is_integer():
        offset = int(offset)
      mlperf.logger.log(key=mlperf.tags.EVAL_EPOCH_OFFSET, value=offset)

    if (self.params.staged_vars and
        self.params.variable_update != 'parameter_server'):
      raise ValueError('staged_vars for now is only supported with '
                       'variable_update=parameter_server')

    if self.params.variable_update == 'parameter_server':
      if self.job_name:
        if not self.params.staged_vars:
          self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS(
              self)
        else:
          self.variable_mgr = (
              variable_mgr.VariableMgrDistributedFetchFromStagedPS(self))
      else:
        if not self.params.staged_vars:
          self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
        else:
          self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS(
              self)
    elif self.params.variable_update == 'replicated':
      if self.job_name:
        raise ValueError('Invalid variable_update in distributed mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
          self, self.params.all_reduce_spec,
          self.params.agg_small_grads_max_bytes,
          self.params.agg_small_grads_max_group,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'distributed_all_reduce':
      assert self.params.cross_replica_sync
      self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce(
          self, self.params.all_reduce_spec,
          ('worker' if self.num_workers > 1 else 'localhost'),
          self.num_workers, self.params.agg_small_grads_max_bytes,
          self.params.agg_small_grads_max_group,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'collective_all_reduce':
      assert self.params.cross_replica_sync
      self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce(
          self, self.params.all_reduce_spec,
          self.num_workers, self.num_gpus, self.task_index,
          self.params.allreduce_merge_scope)
    elif self.params.variable_update == 'distributed_replicated':
      assert self.params.cross_replica_sync
      if not self.job_name:
        raise ValueError('Invalid variable_update in local mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self)
    elif self.params.variable_update in ('independent', 'horovod'):
      if self.job_name:
        raise ValueError('Invalid variable_update in distributed mode: %s' %
                         self.params.variable_update)
      self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
    else:
      raise ValueError('Invalid variable_update: %s' %
                       self.params.variable_update)

    # Device to use for running on the local worker's compute device, but
    # with variables assigned to parameter server devices.
    self.devices = self.variable_mgr.get_devices()

    if self.job_name:
      if use_ps_server:
        self.global_step_device = self.param_server_device
      elif self.params.variable_update == 'collective_all_reduce':
        self.global_step_device = self.cpu_device
      else:
        self.global_step_device = '/job:worker/replica:0/task:0/cpu:0'
    else:
      self.global_step_device = self.cpu_device

    self.input_preprocessor = None
    self.eval_input_preprocessor = None
    if not self.dataset.use_synthetic_gpu_inputs():
      if not self.params.eval:
        self.input_preprocessor = self.get_input_preprocessor()
      if self.mode in (constants.BenchmarkMode.EVAL,
                       constants.BenchmarkMode.TRAIN_AND_EVAL):
        with self._do_eval():
          self.eval_input_preprocessor = self.get_input_preprocessor()
    self.datasets_use_prefetch = (
        self.params.datasets_use_prefetch and
        # TODO(rohanj): Figure out why --datasets_use_prefetch freezes on the
        # CPU.
        self.params.device.lower() != 'cpu' and
        self.input_preprocessor and
        self.input_preprocessor.supports_datasets())
    self.init_global_step = 0

    self._config_benchmark_logger()

    if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
      # Remove "eval" from params so it is not accidentally used. Since eval
      # can still occur despite params.eval being False, params.eval should
      # never be used. We cannot yet remove this unconditionally, because the
      # SSD model still uses params.eval, and hence does not work properly
      # with --eval_during_training_*.
      # TODO(b/116627045): We should also remove fields that have an eval
      # equivalent, like num_batches and num_eval_batches.
      self.params = remove_param_fields(self.params, {'eval'})

  @contextlib.contextmanager
  def _do_eval(self):
    """Context manager that switches BenchmarkCNN to eval mode.

    Any evaluation code should be put under this context manager. This context
    manager switches self._doing_eval to True. It also switches certain
    attributes, like self.num_batches and self.num_epochs, to be the number of
    batches and epochs for evaluation respectively.

    Yields:
      Nothing.
    """
    # TODO(b/116627045): Find a more general way of switching attributes to
    # the eval equivalents.
    old_doing_eval = self._doing_eval
    old_num_batches = self.num_batches
    old_num_epochs = self.num_epochs
    old_batch_size = self.batch_size
    try:
      self._doing_eval = True
      self.num_batches = self.num_eval_batches
      self.num_epochs = self.num_eval_epochs
      self.batch_size = self.eval_batch_size
      self.model.set_batch_size(self.eval_batch_size // self.num_gpus)
      yield
    finally:
      self._doing_eval = old_doing_eval
      self.num_batches = old_num_batches
      self.num_epochs = old_num_epochs
      self.batch_size = old_batch_size
      self.model.set_batch_size(old_batch_size // self.num_gpus)

  def _config_benchmark_logger(self):
    """Config the model garden benchmark logger."""
    model_benchmark_logger = None
    if self.params.benchmark_log_dir is not None:
      try:
        from official.r1.utils.logs import logger as models_logger  # pylint: disable=g-import-not-at-top
      except ImportError:
        tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH '
                         'in order to use BenchmarkLogger. Configured '
                         'benchmark_log_dir: %s'
                         % self.params.benchmark_log_dir)
        raise
      model_benchmark_logger = models_logger.BenchmarkFileLogger(
          self.params.benchmark_log_dir)
    self.benchmark_logger = model_benchmark_logger

  # TODO(laigd): this changes the global device list which is used everywhere,
  # consider refactoring it.
  def reset_devices_for_task(self, task_num, is_local=False):
    """Used to imitate another task when building a distributed graph."""
    worker_prefix = ('/job:localhost' if is_local else
                     '/job:worker/replica:0/task:%s' % task_num)
    self.cpu_device = '%s/cpu:0' % worker_prefix
    self.raw_devices = [
        '%s/%s:%i' % (worker_prefix, self.params.device, i)
        for i in xrange(self.num_gpus)
    ]
    self.devices = self.variable_mgr.get_devices()

  def raw_devices_across_tasks(self, is_local=False):
    """Returns list of raw device names across all tasks."""
    if is_local:
      assert self.num_workers == 1
      return self.raw_devices
    else:
      return [
          'job:worker/replica:0/task%s/%s:%i' % (t, self.params.device, i)
          for t in xrange(self.num_workers)
          for i in xrange(self.num_gpus)
      ]
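
  # Illustrative: with num_workers=2, num_gpus=2 and --device=gpu, the
  # non-local branch above yields
  #   ['job:worker/replica:0/task0/gpu:0', 'job:worker/replica:0/task0/gpu:1',
  #    'job:worker/replica:0/task1/gpu:0', 'job:worker/replica:0/task1/gpu:1'].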

  def print_info(self):
    """Print basic information."""
    benchmark_info = self._get_params_info()

    log_fn('Model: %s' % self.model.get_model_name())
    log_fn('Dataset: %s' % benchmark_info['dataset_name'])
    log_fn('Mode: %s' % self.mode)
    log_fn('SingleSess: %s' % benchmark_info['single_session'])
    log_fn('Batch size: %s global' % (self.batch_size * self.num_workers))
    log_fn(' %s per device' % (self.batch_size // len(self.raw_devices)))
    if self.batch_group_size > 1:
      log_fn(' %d batches per preprocessing group' % self.batch_group_size)
    log_fn('Num batches: %d' % self.num_batches)
    log_fn('Num epochs: %.2f' % self.num_epochs)
    log_fn('Devices: %s' % benchmark_info['device_list'])
    log_fn('NUMA bind: %s' % self.params.use_numa_affinity)
    log_fn('Data format: %s' % self.params.data_format)
    if self.rewriter_config:
      log_fn('RewriterConfig: %s' % self.rewriter_config)
    log_fn('Optimizer: %s' % self.params.optimizer)
    log_fn('Variables: %s' % self.params.variable_update)
    if (self.params.variable_update == 'replicated' or
        self.params.variable_update == 'distributed_all_reduce' or
        self.params.variable_update == 'collective_all_reduce'):
      log_fn('AllReduce: %s' % self.params.all_reduce_spec)
    if self.job_name:
      log_fn('Sync: %s' % self.params.cross_replica_sync)
    if self.params.staged_vars:
      log_fn('Staged vars: %s' % self.params.staged_vars)
    if self.params.variable_update == 'horovod' and self.params.horovod_device:
      log_fn('Horovod on: %s' % self.params.horovod_device)
    log_fn('==========')

  def _get_params_info(self):
    """Get the common parameters info for the benchmark run.

    Returns:
      A dict of processed parameters.
    """
    dataset_name = self.dataset.name
    if self.dataset.use_synthetic_gpu_inputs():
      dataset_name += ' (synthetic)'
    single_session = self.params.variable_update == 'distributed_all_reduce'
    if single_session:
      device_list = self.raw_devices_across_tasks()
    elif self.params.variable_update == 'horovod':
      device_list = ['horovod/%s:%d' % (self.params.device, idx)
                     for idx in range(self.num_workers)]
    else:
      device_list = self.raw_devices
    return {
        'dataset_name': dataset_name,
        'single_session': single_session,
        'device_list': device_list,
    }

  def _log_benchmark_run(self):
    """Log the benchmark info to the logger.

    The info logged here should be similar to print_info(), but in a
    structured JSON format.
    """
    if self.benchmark_logger:
      benchmark_info = self._get_params_info()
      run_param = {
          'model': self.model.get_model_name(),
          'dataset': benchmark_info['dataset_name'],
          'mode': self.mode,
          'single_sess': benchmark_info['single_session'],
          'devices': benchmark_info['device_list'],
          'batch_size': self.batch_size,
          'batch_size_per_device': self.batch_size // len(self.raw_devices),
          'num_batches': self.num_batches,
          'num_epochs': self.num_epochs,
          'data_format': self.params.data_format,
          'rewrite_config': self.rewriter_config,
          'optimizer': self.params.optimizer,
          'session_config': create_config_proto(self.params),
      }
      # TODO(scottzhu): tf_cnn_benchmark might execute several times with
      # different param setting on the same box. This will cause the run file
      # to only contain the latest info. The benchmark_log_dir should be
      # updated for every new run.
      self.benchmark_logger.log_run_info(
          self.model.get_model_name(), benchmark_info['dataset_name'],
          run_param, test_id=self.params.benchmark_test_id)

  def run(self):
    """Run the benchmark task assigned to this process.

    Returns:
      Dictionary of statistics for training or eval.
    Raises:
      ValueError: unrecognized job name.
    """
    if self.params.job_name == 'ps':
      log_fn('Running parameter server %s' % self.task_index)
      self.cluster_manager.join_server()
      return {}

    # For distributed_all_reduce with multiple workers, drive
    # from a separate controller process.
    if self.params.variable_update == 'distributed_all_reduce':
      if self.params.job_name == 'worker':
        log_fn('Starting worker %s' % self.task_index)
        self.cluster_manager.join_server()
        return
    elif self.params.job_name and self.params.job_name != 'controller':
      raise ValueError('unrecognized job name: %s' % self.params.job_name)

    self._log_benchmark_run()
    if self._doing_eval:
      with tf.Graph().as_default():
        # TODO(laigd): freeze the graph in eval mode.
        return self._run_eval()
    else:
      return self._benchmark_train()

  def _run_eval(self):
    """Evaluate a model every self.params.eval_interval_secs.

    Returns:
      Dictionary containing eval statistics. Currently returns an empty
      dictionary.

    Raises:
      ValueError: If self.params.train_dir is unspecified.
    """
    if self.params.train_dir is None:
      raise ValueError('Trained model directory not specified')
    graph_info = self._build_eval_graph()
    saver = tf.train.Saver(self.variable_mgr.savable_variables())
    summary_writer = tf.summary.FileWriter(self.params.eval_dir,
                                           tf.get_default_graph())
    target = ''
    # TODO(huangyp): Check if checkpoints haven't updated for hours and abort.
    while True:
      with tf.Session(
          target=target, config=create_config_proto(self.params)) as sess:
        image_producer = None
        try:
          global_step = load_checkpoint(saver, sess, self.params.train_dir)
          image_producer = self._initialize_eval_graph(
              graph_info.enqueue_ops, graph_info.input_producer_op,
              graph_info.local_var_init_op_group, sess)
        except CheckpointNotFoundException:
          log_fn('Checkpoint not found in %s' % self.params.train_dir)
        else:
          # Only executes if an exception was not thrown
          self._eval_once(sess, summary_writer, graph_info.fetches,
                          graph_info.summary_op, image_producer, global_step)
        if image_producer is not None:
          image_producer.done()
        if self.params.eval_interval_secs <= 0:
          break
        time.sleep(self.params.eval_interval_secs)
    return {}

  def _build_eval_graph(self, scope_name=None):
    """Build the evaluation graph.

    Args:
      scope_name: String to filter what summaries are collected. Only summary
        ops whose name contains `scope_name` will be added, which is useful
        for only including evaluation ops.

    Returns:
      A GraphInfo named_tuple containing various useful ops and tensors of
      the evaluation graph.
    """
    with self._do_eval():
      input_producer_op, enqueue_ops, fetches = self._build_model()
      local_var_init_op = tf.local_variables_initializer()
      table_init_ops = tf.tables_initializer()
      variable_mgr_init_ops = [local_var_init_op]
      if table_init_ops:
        variable_mgr_init_ops.extend([table_init_ops])
      with tf.control_dependencies([local_var_init_op]):
        variable_mgr_init_ops.extend(self.variable_mgr.get_post_init_ops())
      local_var_init_op_group = tf.group(*variable_mgr_init_ops)

      summary_op = tf.summary.merge_all(scope=scope_name)
      # The eval graph has no execution barrier because it doesn't run in
      # distributed mode.
      execution_barrier = None
      # We do not use the global step during evaluation.
      global_step = None
      return GraphInfo(input_producer_op, enqueue_ops, fetches,
                       execution_barrier, global_step,
                       local_var_init_op_group, summary_op)

  # TODO(reedwm): For consistency, we should have a similar
  # "_initialize_train_graph" function. They can likely be the same function.
  def _initialize_eval_graph(self, enqueue_ops, input_producer_op,
                             local_var_init_op_group, sess):
    """Initializes the evaluation graph.

    Args:
      enqueue_ops: Ops that add the preprocessed images to the staging areas.
      input_producer_op: Op that produces the input batches (before
        preprocessing).
      local_var_init_op_group: Group of ops that perform per-device
        initialization work.
      sess: The session to initialize the eval graph with.

    Returns:
      An ImageProducer, or None if an ImageProducer isn't being used.
    """
    with self._do_eval():
      if local_var_init_op_group is not None:
        # We might reinitialize local variables if they were already
        # initialized during training. This is OK.
        sess.run(local_var_init_op_group)
      if self.dataset.queue_runner_required():
        tf.train.start_queue_runners(sess=sess)
      image_producer = None
      if input_producer_op is not None:
        image_producer = cnn_util.ImageProducer(
            sess, input_producer_op, self.batch_group_size,
            self.params.use_python32_barrier)
        image_producer.start()
      if enqueue_ops:
        for i in xrange(len(enqueue_ops)):
          sess.run(enqueue_ops[:(i + 1)])
          if image_producer is not None:
            image_producer.notify_image_consumption()
      return image_producer

  def _eval_once(self, sess, summary_writer, fetches, summary_op,
                 image_producer, global_step):
    """Evaluate the model using the validation dataset."""
    with self._do_eval():
      mlperf.logger.log_eval_epoch(mlperf.tags.EVAL_START, global_step,
                                   self.batch_size)
      loop_start_time = start_time = time.perf_counter()
      # TODO(laigd): refactor the part to compute/report the accuracy.
      # Currently it only works for image models.
      top_1_accuracy_sum = 0.0
      top_5_accuracy_sum = 0.0
      total_eval_count = self.num_batches * self.batch_size
      for step in xrange(self.num_batches):
        if (summary_writer and self.params.save_summaries_steps > 0 and
            (step + 1) % self.params.save_summaries_steps == 0):
          results, summary_str = sess.run([fetches, summary_op])
          summary_writer.add_summary(summary_str)
        else:
          results = sess.run(fetches)
        # Make global_step available in results for postprocessing.
        results['global_step'] = global_step
        results = self.model.postprocess(results)
        top_1_accuracy_sum += results['top_1_accuracy']
        top_5_accuracy_sum += results['top_5_accuracy']
        if (step + 1) % self.params.display_every == 0:
          duration = time.perf_counter() - start_time
          examples_per_sec = (
              self.batch_size * self.params.display_every / duration)
          log_fn('%i\t%.1f examples/sec' % (step + 1, examples_per_sec))
          start_time = time.perf_counter()
        if image_producer is not None:
          image_producer.notify_image_consumption()
      loop_end_time = time.perf_counter()
      accuracy_at_1 = top_1_accuracy_sum / self.num_batches
      accuracy_at_5 = top_5_accuracy_sum / self.num_batches
      summary = tf.Summary()
      summary.value.add(tag='eval/Accuracy@1', simple_value=accuracy_at_1)
      summary.value.add(tag='eval/Accuracy@5', simple_value=accuracy_at_5)
      for result_key, result_value in results.items():
        if result_key.startswith(constants.SIMPLE_VALUE_RESULT_PREFIX):
          prefix_len = len(constants.SIMPLE_VALUE_RESULT_PREFIX)
          summary.value.add(tag='eval/' + result_key[prefix_len:],
                            simple_value=result_value)
      if summary_writer:
        summary_writer.add_summary(summary, global_step)
      log_fn('Accuracy @ 1 = %.4f Accuracy @ 5 = %.4f [%d examples]' %
             (accuracy_at_1, accuracy_at_5, total_eval_count))
      elapsed_time = loop_end_time - loop_start_time
      images_per_sec = (self.num_batches * self.batch_size / elapsed_time)
      if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
        # Note that we compute the top 1 accuracy and top 5 accuracy for each
        # batch, which will have a slight performance impact.
        log_fn('-' * 64)
        log_fn('total images/sec: %.2f' % images_per_sec)
        log_fn('-' * 64)
      if self.benchmark_logger:
        eval_result = {
            'eval_top_1_accuracy': accuracy_at_1,
            'eval_top_5_accuracy': accuracy_at_5,
            'eval_average_examples_per_sec': images_per_sec,
            tf.GraphKeys.GLOBAL_STEP: global_step,
        }
        self.benchmark_logger.log_evaluation_result(eval_result)
      mlperf.logger.log_eval_epoch(mlperf.tags.EVAL_STOP, global_step,
                                   self.batch_size)
      mlperf.logger.log(key=mlperf.tags.EVAL_SIZE,
                        value=self.num_batches * self.batch_size)
      if self.params.model != 'ssd300':
        # ssd300 logs eval accuracy elsewhere.
        mlperf.logger.log_eval_accuracy(
            accuracy_at_1, global_step, self.train_batch_size,
            examples_per_epoch=self.dataset.num_examples_per_epoch('train'))
      if self.params.stop_at_top_1_accuracy:
        mlperf.logger.log(key=mlperf.tags.EVAL_TARGET,
                          value=self.params.stop_at_top_1_accuracy)
      return accuracy_at_1, accuracy_at_5

  def _benchmark_train(self):
    """Run cnn in benchmark mode. Skip the backward pass if forward_only is on.

    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    graph = tf.Graph()
    with graph.as_default():
      build_result = self._build_graph()
      if self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL:
        with self.variable_mgr.reuse_variables():
          with tf.name_scope('Evaluation') as ns:
            eval_build_results = self._build_eval_graph(ns)
      else:
        eval_build_results = None
    (graph, result_to_benchmark) = self._preprocess_graph(graph, build_result)
    with graph.as_default():
      return self._benchmark_graph(result_to_benchmark, eval_build_results)

  GPU_CACHED_INPUT_VARIABLE_NAME = 'gpu_cached_inputs'

  def _unfreezable_local_variables(self, graph):
    """Get the local variables that we don't want to freeze."""
    return graph.get_collection(
        tf.GraphKeys.LOCAL_VARIABLES,
        # We don't freeze the gpu_cached_images local variable so it won't get
        # constant folded with ops which process the input.
        scope='.*' + BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME)

  def _build_graph(self):
    """Build the graph.

    Returns:
      A namedtuple containing the ops/tensors that required by
      _benchmark_graph().
    """
    if self.single_session:
      (input_producer_op, enqueue_ops, fetches) = (
          self._build_model_single_session())
    else:
      (input_producer_op, enqueue_ops, fetches) = self._build_model()
    fetches_list = nest.flatten(list(fetches.values()))
    main_fetch_group = tf.group(*fetches_list, name='main_fetch_group')
    execution_barrier = None
    if (not self.single_session and self.job_name and
        not self.params.cross_replica_sync):
      execution_barrier = self.add_sync_queues_and_barrier(
          'execution_barrier_', [])

    global_step = tf.train.get_global_step()
    with tf.device(self.global_step_device), tf.name_scope('inc_global_step'):
      with tf.control_dependencies([main_fetch_group]):
        fetches['inc_global_step'] = global_step.assign_add(1)

    if ((not self.single_session) and (not self.distributed_collective) and
        self.job_name and self.params.cross_replica_sync):
      # Block all replicas until all replicas are ready for next step.
      fetches['sync_queues'] = self.add_sync_queues_and_barrier(
          'sync_queues_step_end_', [main_fetch_group])

    # Skips the init ops for freezable local variables in forward_only mode so
    # we can remove all the assign ops when converting variables to constants.
    with tf.name_scope('local_variable_initialization'):
      if self.forward_only_and_freeze:
        local_var_init_op = tf.variables_initializer(
            self._unfreezable_local_variables(tf.get_default_graph()))
      else:
        local_var_init_op = tf.local_variables_initializer()
    table_init_ops = tf.tables_initializer()

    variable_manager_init_ops = [local_var_init_op]
    if table_init_ops:
      variable_manager_init_ops.extend([table_init_ops])
    if not self.forward_only_and_freeze:
      with tf.control_dependencies([local_var_init_op]):
        variable_manager_init_ops.extend(self.variable_mgr.get_post_init_ops())
    if ((not self.single_session) and (not self.distributed_collective) and
        self.job_name and self.params.cross_replica_sync):
      # Ensure all workers execute variable_manager_init_ops before they start
      # executing the model.
      variable_manager_init_ops.append(
          self.add_sync_queues_and_barrier('init_ops_end_',
                                           variable_manager_init_ops))
    local_var_init_op_group = tf.group(*variable_manager_init_ops,
                                       name='local_var_init_op_group')
    summary_op = tf.summary.merge_all()

    return GraphInfo(
        input_producer_op=input_producer_op,
        enqueue_ops=enqueue_ops,
        fetches=fetches,
        execution_barrier=execution_barrier,
        global_step=global_step,
        local_var_init_op_group=local_var_init_op_group,
        summary_op=summary_op)

  def _benchmark_graph(self, graph_info, eval_graph_info):
    """Benchmark the training graph.

    Args:
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
      eval_graph_info: Similar to graph_info but for the eval graph if
        --eval_during_training_* is used. Otherwise, None.
    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    log_fn('Initializing graph')
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      # First worker will be 'chief' - it will write summaries and
      # save checkpoints.
      is_chief = hvd.rank() == 0
    else:
      is_chief = (not self.job_name or self.task_index == 0)

    summary_writer = None
    if (is_chief and self.params.summary_verbosity and self.params.train_dir and
        self.params.save_summaries_steps > 0):
      summary_writer = tf.summary.FileWriter(self.params.train_dir,
                                             tf.get_default_graph())

    # We want to start the benchmark timer right after a image_producer barrier
    # and avoids undesired waiting times on barriers.
    if ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
        self.batch_group_size) != 0:
      self.num_warmup_batches = int(
          math.ceil((self.num_warmup_batches + len(graph_info.enqueue_ops) -
                     1.0) / (self.batch_group_size)) * self.batch_group_size -
          len(graph_info.enqueue_ops) + 1)
      log_fn('Round up warm up steps to %d to match batch_group_size' %
             self.num_warmup_batches)
      assert ((self.num_warmup_batches + len(graph_info.enqueue_ops) - 1) %
              self.batch_group_size) == 0

    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    if is_chief and not self.forward_only_and_freeze:
      saver = tf.train.Saver(
          self.variable_mgr.savable_variables(),
          save_relative_paths=True,
          max_to_keep=self.params.max_ckpts_to_keep)
    else:
      saver = None
    ready_for_local_init_op = None
    if self.job_name and not (self.single_session or
                              self.distributed_collective):
      # In distributed mode, we don't want to run local_var_init_op_group until
      # the global variables are initialized, because local_var_init_op_group
      # may use global variables (such as in distributed replicated mode). We
      # don't set this in non-distributed mode, because in non-distributed mode,
      # local_var_init_op_group may itself initialize global variables (such as
      # in replicated mode).
      ready_for_local_init_op = tf.report_uninitialized_variables(
          tf.global_variables())
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      bcast_global_variables_op = hvd.broadcast_global_variables(0)
    else:
      bcast_global_variables_op = None

    if self.params.variable_update == 'collective_all_reduce':
      # It doesn't matter what this collective_graph_key value is,
      # so long as it's > 0 and the same at every worker.
      init_run_options = tf.RunOptions()
      init_run_options.experimental.collective_graph_key = 6
    else:
      init_run_options = tf.RunOptions()
    local_var_init_ops = [graph_info.local_var_init_op_group]
    if eval_graph_info:
      # `eval_graph_info.local_var_init_op_group` also includes some of the
      # training initializer ops, since it's difficult to filter them out.
      # Rerunning the training initializer ops is OK, but we add a control
      # dependency since running two sets of training initializer ops at the
      # same time can cause race conditions.
      with tf.control_dependencies(local_var_init_ops):
        local_var_init_ops.append(eval_graph_info.local_var_init_op_group)
    sv = tf.train.Supervisor(
        # For the purpose of Supervisor, all Horovod workers are 'chiefs',
        # since we want session to be initialized symmetrically on all the
        # workers.
        is_chief=is_chief or (self.params.variable_update == 'horovod'
                              or self.distributed_collective),
        # Log dir should be unset on non-chief workers to prevent Horovod
        # workers from corrupting each other's checkpoints.
        logdir=self.params.train_dir if is_chief else None,
        ready_for_local_init_op=ready_for_local_init_op,
        local_init_op=local_var_init_ops,
        saver=saver,
        global_step=graph_info.global_step,
        summary_op=None,
        save_model_secs=self.params.save_model_secs,
        summary_writer=summary_writer,
        local_init_run_options=init_run_options)

    profiler = tf.profiler.Profiler() if self.params.tfprof_file else None
    if self.graph_file is not None:
      path, filename = os.path.split(self.graph_file)
      as_text = filename.endswith('txt')
      log_fn('Writing GraphDef as %s to %s' % (  # pyformat break
          'text' if as_text else 'binary', self.graph_file))
      tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
                           path, filename, as_text)

    start_standard_services = (
        self.params.train_dir or
        self.dataset.queue_runner_required())
    target = self.cluster_manager.get_target() if self.cluster_manager else ''
    with sv.managed_session(
        master=target,
        config=create_config_proto(self.params),
        start_standard_services=start_standard_services) as sess:
      # Anything that can potentially raise an OutOfRangeError with 'sess' MUST
      # be under this try block. The managed_session() context manager silently
      # ignores OutOfRangeError, so we must catch them and wrap them with
      # a different exception type so that they can be propagated up to the
      # caller.
      try:
        stats = self.benchmark_with_session(
            sess, sv, graph_info, eval_graph_info, bcast_global_variables_op,
            is_chief, summary_writer, profiler)
      except tf.errors.OutOfRangeError:
        raise RuntimeError(
            'Received OutOfRangeError. Wrapping in Runtime error to avoid '
            'Supervisor from suppressing the error. Original OutOfRangeError '
            'with traceback:\n' + traceback.format_exc())
    sv.stop()
    if profiler:
      generate_tfprof_profile(profiler, self.params.tfprof_file)
    return stats

  def benchmark_with_session(self, sess, supervisor, graph_info,
                             eval_graph_info, bcast_global_variables_op,
                             is_chief, summary_writer, profiler):
    """Benchmarks the graph with the given session.

    Args:
      sess: The session to benchmark the graph with
      supervisor: The Supervisor that created the session.
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.
      eval_graph_info: Similar to graph_info but for the eval graph if
        --eval_during_training_every_n_steps is used. Otherwise, None.
      bcast_global_variables_op: If Horovod is used, the op to broadcast the
        global variables to all the processes. None if Horovod is not used.
      is_chief: True if this is the chief process.
      summary_writer: The SummaryWriter used to write summaries, or None if
        summaries are not used.
      profiler: The tf.profiler.Profiler, or None if tfprof is not used.
    Returns:
      Dictionary containing training statistics (num_workers, num_steps,
      average_wall_time, images_per_sec).
    """
    if self.params.backbone_model_path is not None:
      self.model.load_backbone_model(sess, self.params.backbone_model_path)
    if bcast_global_variables_op:
      sess.run(bcast_global_variables_op)
    image_producer = None
    if graph_info.input_producer_op is not None:
      image_producer = cnn_util.ImageProducer(
          sess, graph_info.input_producer_op, self.batch_group_size,
          self.params.use_python32_barrier)
      image_producer.start()
    if graph_info.enqueue_ops:
      for i in xrange(len(graph_info.enqueue_ops)):
        sess.run(graph_info.enqueue_ops[:(i + 1)])
        if image_producer is not None:
          image_producer.notify_image_consumption()
    self.init_global_step, = sess.run([graph_info.global_step])
    if self.job_name and not self.params.cross_replica_sync:
      # TODO(zhengxq): Do we need to use a global step watcher at all?
      global_step_watcher = GlobalStepWatcher(
          sess, graph_info.global_step,
          self.num_workers * self.num_warmup_batches + self.init_global_step,
          self.num_workers * (self.num_warmup_batches + self.num_batches) - 1)
      global_step_watcher.start()
    else:
      global_step_watcher = None
    eval_image_producer = None
    if eval_graph_info:
      # We pass local_var_init_op_group=None because the Supervisor already
      # initialized local variables above. We need to have the Supervisor
      # initialize the local variables, because otherwise it throws an error
      # complaining that not all variables were initialized.
      eval_image_producer = self._initialize_eval_graph(
          eval_graph_info.enqueue_ops, eval_graph_info.input_producer_op,
          local_var_init_op_group=None, sess=sess)

    step_train_times = []
    log_fn('Running warm up')
    local_step = -1 * self.num_warmup_batches
    if self.single_session:
      # In single session mode, each step, the global_step is incremented by
      # 1. In non-single session mode, each step, the global_step is
      # incremented once per worker. This means we need to divide
      # init_global_step by num_workers only in non-single session mode.
      end_local_step = self.num_batches - self.init_global_step
    else:
      end_local_step = self.num_batches - (self.init_global_step //
                                           self.num_workers)
    if not global_step_watcher:
      # In cross-replica sync mode, all workers must run the same number of
      # local steps, or else the workers running the extra step will block.
      done_fn = lambda: local_step >= end_local_step
    else:
      done_fn = global_step_watcher.done
    if self.params.debugger is not None:
      if self.params.debugger == 'cli':
        log_fn('The CLI TensorFlow debugger will be used.')
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)
      else:
        log_fn('The TensorBoard debugger plugin will be used.')
        sess = tf_debug.TensorBoardDebugWrapperSession(sess,
                                                       self.params.debugger)
    mlperf.logger.log(key=mlperf.tags.TRAIN_LOOP)
    skip_final_eval = False
    accuracy_at_1 = None
    accuracy_at_5 = None
    last_eval_step = local_step
    loop_start_time = time.perf_counter()
    last_average_loss = None
    while not done_fn():
      if local_step == 0:
        log_fn('Done warm up')
        if graph_info.execution_barrier:
          log_fn('Waiting for other replicas to finish warm up')
          sess.run([graph_info.execution_barrier])

        # TODO(laigd): rename 'Img' to maybe 'Input'.
        header_str = ('Step\tImg/sec\t' +
                      self.params.loss_type_to_report.replace('/', ' '))
        if self.params.print_training_accuracy or self.params.forward_only:
          # TODO(laigd): use the actual accuracy op names of the model.
          header_str += '\ttop_1_accuracy\ttop_5_accuracy'
        log_fn(header_str)
        assert len(step_train_times) == self.num_warmup_batches
        # reset times to ignore warm up batch
        step_train_times = []
        loop_start_time = time.perf_counter()
      if (summary_writer and
          (local_step + 1) % self.params.save_summaries_steps == 0):
        fetch_summary = graph_info.summary_op
      else:
        fetch_summary = None
      collective_graph_key = 7 if (
          self.params.variable_update == 'collective_all_reduce') else 0
      (summary_str, last_average_loss) = benchmark_one_step(
          sess, graph_info.fetches, local_step,
          self.batch_size * (self.num_workers if self.single_session else 1),
          step_train_times, self.trace_filename,
          self.params.partitioned_graph_file_prefix, profiler, image_producer,
          self.params, fetch_summary,
          benchmark_logger=self.benchmark_logger,
          collective_graph_key=collective_graph_key,
          should_output_files=(self.params.variable_update != 'horovod' or
                               is_chief))
      if summary_str is not None and is_chief:
        supervisor.summary_computed(sess, summary_str)
      local_step += 1
      if (self.params.save_model_steps and
          local_step % self.params.save_model_steps == 0 and
          local_step > 0 and is_chief):
        supervisor.saver.save(sess, supervisor.save_path,
                              supervisor.global_step)
      if (eval_graph_info and local_step > 0 and not done_fn() and
          self._should_eval_during_training(local_step)):
        python_global_step = sess.run(graph_info.global_step)
        num_steps_since_last_eval = local_step - last_eval_step
        # The INPUT_SIZE tag value might not match the
        # PREPROC_NUM_TRAIN_EXAMPLES tag value, because the number of examples
        # run, which is INPUT_SIZE, is rounded up to the nearest multiple of
        # self.batch_size.
        mlperf.logger.log(key=mlperf.tags.INPUT_SIZE,
                          value=num_steps_since_last_eval * self.batch_size)
        log_fn('Running evaluation at global_step {}'.format(
            python_global_step))
        accuracy_at_1, accuracy_at_5 = self._eval_once(
            sess, summary_writer, eval_graph_info.fetches,
            eval_graph_info.summary_op, eval_image_producer,
            python_global_step)
        last_eval_step = local_step
        if (self.params.stop_at_top_1_accuracy and
            accuracy_at_1 >= self.params.stop_at_top_1_accuracy):
          log_fn('Stopping, as eval accuracy at least %s was reached' %
                 self.params.stop_at_top_1_accuracy)
          skip_final_eval = True
          break
        else:
          log_fn('Resuming training')
      if eval_graph_info and self.model.reached_target():
        log_fn('Stopping, as the model indicates its custom goal was reached')
        skip_final_eval = True
        break
    loop_end_time = time.perf_counter()
    # Waits for the global step to be done, regardless of done_fn.
    if global_step_watcher:
      while not global_step_watcher.done():
        time.sleep(.25)
    if not global_step_watcher:
      elapsed_time = loop_end_time - loop_start_time
      average_wall_time = elapsed_time / local_step if local_step > 0 else 0
      images_per_sec = (self.num_workers * local_step * self.batch_size /
                        elapsed_time)
      num_steps = local_step * self.num_workers
    else:
      # NOTE: Each worker independently increases the global step. So,
      # num_steps will be the sum of the local_steps from each worker.
      num_steps = global_step_watcher.num_steps()
      elapsed_time = global_step_watcher.elapsed_time()
      average_wall_time = (elapsed_time * self.num_workers / num_steps
                           if num_steps > 0 else 0)
      images_per_sec = num_steps * self.batch_size / elapsed_time

    # We skip printing images/sec if --eval_during_training_* is specified,
    # because we are both processing training and evaluation images, so a
    # singular "images/sec" value is meaningless.
    if self.mode != constants.BenchmarkMode.TRAIN_AND_EVAL:
      log_fn('-' * 64)
      # TODO(laigd): rename 'images' to maybe 'inputs'.
      log_fn('total images/sec: %.2f' % images_per_sec)
      log_fn('-' * 64)
    else:
      log_fn('Done with training')

    num_steps_since_last_eval = local_step - last_eval_step
    mlperf.logger.log(key=mlperf.tags.INPUT_SIZE,
                      value=num_steps_since_last_eval * self.batch_size)
    python_global_step = sess.run(graph_info.global_step)
    if eval_graph_info and not skip_final_eval:
      log_fn('Running final evaluation at global_step {}'.format(
          python_global_step))
      accuracy_at_1, accuracy_at_5 = self._eval_once(
          sess, summary_writer, eval_graph_info.fetches,
          eval_graph_info.summary_op, eval_image_producer, python_global_step)
    num_epochs_ran = (python_global_step * self.batch_size /
                      self.dataset.num_examples_per_epoch('train'))
    mlperf.logger.log_train_epochs(num_epochs_ran)
    if image_producer is not None:
      image_producer.done()
    if eval_image_producer is not None:
      eval_image_producer.done()
    if is_chief:
      if self.benchmark_logger:
        self.benchmark_logger.log_metric(
            'average_examples_per_sec', images_per_sec, global_step=num_steps)

    # Save the model checkpoint.
    if self.params.train_dir is not None and is_chief:
      checkpoint_path = os.path.join(self.params.train_dir, 'model.ckpt')
      if not gfile.Exists(self.params.train_dir):
        gfile.MakeDirs(self.params.train_dir)
      supervisor.saver.save(sess, checkpoint_path, graph_info.global_step)
    if graph_info.execution_barrier:
      # Wait for other workers to reach the end, so this worker doesn't
      # go away underneath them.
      sess.run([graph_info.execution_barrier])
    stats = {
        'num_workers': self.num_workers,
        'num_steps': num_steps,
        'average_wall_time': average_wall_time,
        'images_per_sec': images_per_sec
    }
    if last_average_loss is not None:
      stats['last_average_loss'] = last_average_loss
    if accuracy_at_1 is not None:
      stats['top_1_accuracy'] = accuracy_at_1
    if accuracy_at_5 is not None:
      stats['top_5_accuracy'] = accuracy_at_5
    success = bool(self.model.reached_target() or
                   (accuracy_at_1 and self.params.stop_at_top_1_accuracy and
                    accuracy_at_1 >= self.params.stop_at_top_1_accuracy))
    mlperf.logger.log(key=mlperf.tags.RUN_STOP, value={'success': success})
    mlperf.logger.log(key=mlperf.tags.RUN_FINAL)
    return stats

  def _should_eval_during_training(self, step):
    """Return True iff should run eval during training at current step."""
    assert self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL
    if self.params.eval_during_training_every_n_steps:
      return step % self.params.eval_during_training_every_n_steps == 0
    # All other --eval_during_training_* flags are converted to step numbers
    # at which the model should run evaluation during training.
    return step in self.eval_during_training_at_specified_steps
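
  # For example (illustrative note, not from the original code): with
  # --eval_during_training_every_n_steps=100, the predicate above is true at
  # local steps 100, 200, 300, and so on; with the other
  # --eval_during_training_* flags, the eligible steps are precomputed once
  # into self.eval_during_training_at_specified_steps and looked up here.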

  def _preprocess_graph(self, graph, graph_info):
    """Preprocess the graph before executing.

    Depending on the params, it runs various preprocessing on the graph,
    including freezing, TensorRT conversion, etc.

    Args:
      graph: the graph to preprocess.
      graph_info: the namedtuple returned by _build_graph() which
        contains all necessary information to benchmark the graph, including
        named tensors/ops list, fetches, etc.

    Returns:
      The updated graph and graph_info with the ops/tensors/fetches updated
      according to the imported graph.
    """
    assert isinstance(graph_info.fetches, dict)
    assert isinstance(graph_info.global_step, tf.Variable)
    if not self.forward_only_and_freeze:
      return (graph, graph_info)

    # Get the names of the ops that need to keep during conversion.
    flattened_op_names = list(
        set([
            v.name.split(':')[0]
            for v in nest.flatten(graph_info)
            if v is not None
        ]))
    # Get variables that we don't want to freeze.
    # Only keep unfreezable variables in forward_only_and_freeze mode.
    # TODO(laigd): consider making global_step a constant.
    variables_to_keep = {graph_info.global_step: tf.GraphKeys.GLOBAL_VARIABLES}
    variables_to_keep.update({
        local_variable: tf.GraphKeys.LOCAL_VARIABLES
        for local_variable in self._unfreezable_local_variables(graph)
    })
    variable_initializers = [
        variable.initializer.name for variable in variables_to_keep
    ]
    output_node_names = (
        flattened_op_names +
        # Add variable initializer and read ops to the output list, so
        # convert_variables_to_constants() will keep them.
        variable_initializers +
        [variable.value().op.name for variable in variables_to_keep])
    graphdef = graph.as_graph_def(add_shapes=True)

    # Freeze the graph.
    with graph.as_default():
      with tf.Session(config=create_config_proto(self.params)) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        graphdef = graph_util.convert_variables_to_constants(
            sess,
            graphdef,
            output_node_names,
            variable_names_blacklist=[
                variable.op.name for variable in variables_to_keep
            ])

    # Run TensorRT conversion.
    if self.params.trt_mode:
      # Import here instead of at top, because this will crash if TensorRT is
      # not installed
      from tensorflow.python.compiler.tensorrt import trt_convert  # pylint: disable=g-import-not-at-top
      # Avoid TF-TRT bridge from touching all variable initializer ops and their
      # dependencies, since they can directly be fetched by sess.run()s that
      # initialize the variables.
      # pylint: disable=protected-access
      name_to_input_name, _, _ = graph_util_impl._extract_graph_summary(
          graphdef)
      initializer_subgraph_ops = graph_util_impl._bfs_for_reachable_nodes(
          variable_initializers, name_to_input_name)
      # pylint: enable=protected-access

      graphdef = trt_convert.create_inference_graph(
          graphdef,
          outputs=output_node_names + list(initializer_subgraph_ops),
          max_batch_size=self.model.get_batch_size(),
          max_workspace_size_bytes=self.params.trt_max_workspace_size_bytes,
          precision_mode=self.params.trt_mode)

    # Creates a new graph as the default and import the converted graph back.
    updated_graph = tf.Graph()

    def _get_tensors_or_ops(inputs):
      """Gets the updated tensors or ops from 'updated_graph'."""

      def _get_fn(element):
        if element is None:
          return None
        if ':' in element.name:
          return updated_graph.get_tensor_by_name(element.name)
        return updated_graph.get_operation_by_name(element.name)

      if isinstance(inputs, (list, dict, tuple)):
        return nest.map_structure(_get_fn, inputs)
      else:
        return _get_fn(inputs)

    with updated_graph.as_default():
      importer.import_graph_def(graph_def=graphdef, name='')

      # Update the variables
      for variable in variables_to_keep:
        updated_variable = tf.Variable.from_proto(variable.to_proto())
        tf.add_to_collection(variables_to_keep[variable], updated_variable)
        if variable is graph_info.global_step:
          updated_global_step = updated_variable

    updated_graph_info = GraphInfo(
        input_producer_op=_get_tensors_or_ops(graph_info.input_producer_op),
        enqueue_ops=_get_tensors_or_ops(graph_info.enqueue_ops),
        execution_barrier=_get_tensors_or_ops(graph_info.execution_barrier),
        local_var_init_op_group=_get_tensors_or_ops(
            graph_info.local_var_init_op_group),
        fetches=_get_tensors_or_ops(graph_info.fetches),
        global_step=updated_global_step,
        summary_op=None)
    return (updated_graph, updated_graph_info)

  def _build_input_processing(self, shift_ratio=0):
    """Build the image (pre)processing portion of the model graph.

    Args:
      shift_ratio: shift_ratio for data_flow_ops.RecordInput.

    Returns:
      An InputProcessingInfo containing all the input sources to the model.
    """
    input_processing_info = InputProcessingInfo(
        input_producer_op=None,
        input_producer_stages=None,
        multi_device_iterator_input=None)

    mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
    if not self._doing_eval:
      mlperf.logger.log(key=mlperf.tags.INPUT_BATCH_SIZE,
                        value=self.batch_size)
    # If using synthetic gpu inputs, do nothing on the cpu side.
    if self.dataset.use_synthetic_gpu_inputs():
      assert not self.datasets_use_prefetch
      return input_processing_info

    if self._doing_eval:
      input_preprocessor = self.eval_input_preprocessor
      mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_EVAL_EXAMPLES,
                        value=self.dataset.num_examples_per_epoch('validation'))
    else:
      input_preprocessor = self.input_preprocessor
      mlperf.logger.log(key=mlperf.tags.PREPROC_NUM_TRAIN_EXAMPLES,
                        value=self.dataset.num_examples_per_epoch('train'))

    # Use prefetching mechanism provided by dataset input pipeline.
    if self.datasets_use_prefetch:
      multi_device_iterator = (
          input_preprocessor.build_multi_device_iterator(
              self.batch_size, len(self.devices), self.cpu_device, self.params,
              self.raw_devices, self.dataset, self._doing_eval))
      return input_processing_info._replace(
          multi_device_iterator_input=multi_device_iterator.get_next())

    # Not using dataset prefetching. Use a staging area to mimic the prefetching
    # behavior instead.
    with tf.device(self.cpu_device):
      if self._doing_eval:
        subset = 'validation'
      else:
        subset = 'train'
      input_list = input_preprocessor.minibatch(
          self.dataset,
          subset=subset,
          params=self.params,
          shift_ratio=shift_ratio)
      input_producer_op = []
      input_producer_stages = []
      for device_num in range(len(self.devices)):
        staging_area = data_flow_ops.StagingArea(
            [parts[0].dtype for parts in input_list],
            shapes=[parts[0].get_shape() for parts in input_list],
            shared_name='input_producer_staging_area_%d_eval_%s' %
            (device_num, self._doing_eval))
        input_producer_stages.append(staging_area)
        for group_index in xrange(self.batch_group_size):
          batch_index = group_index + device_num * self.batch_group_size
          put_op = staging_area.put(
              [parts[batch_index] for parts in input_list])
          input_producer_op.append(put_op)
      assert input_producer_op

    return input_processing_info._replace(
        input_producer_op=input_producer_op,
        input_producer_stages=input_producer_stages)

  def _maybe_initialize_fp16(self):
    """Initialize fp16 settings."""
    if self.params.use_fp16 and not self._doing_eval:
      init_loss_scale_val = float(self.params.fp16_loss_scale or
                                  self.model.get_fp16_loss_scale())
      self.loss_scale = None
      self.loss_scale_normal_steps = None
      if self.enable_auto_loss_scale or init_loss_scale_val != 1:
        self.loss_scale = tf.get_variable(
            name='loss_scale',
            initializer=init_loss_scale_val,
            dtype=tf.float32,
            trainable=False)
      if self.enable_auto_loss_scale:
        self.loss_scale_normal_steps = tf.get_variable(
            name='loss_scale_normal_steps', initializer=0, trainable=False)
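
  # Worked example (illustrative note, not from the original code): with
  # --fp16_loss_scale=128, the scalar loss is multiplied by 128 before
  # tf.gradients() is called and every resulting gradient is later multiplied
  # by 1/128 (see add_forward_pass_and_gradients), so small fp16 gradient
  # values are kept out of the underflow range while the applied update is
  # numerically unchanged.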

  def _build_model(self):
    """Build the TensorFlow graph."""
    if self.datasets_use_prefetch:
      assert not self.params.staged_vars
      assert not self.variable_mgr.supports_staged_vars()

    # Adjust seed so different workers start read different input files.
    if self.params.variable_update == 'horovod':
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      seed_adjustment = hvd.rank()
    else:
      seed_adjustment = 0
    mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
                      value=self.params.tf_random_seed + seed_adjustment)
    tf.set_random_seed(self.params.tf_random_seed + seed_adjustment)
    mlperf.logger.log(key=mlperf.tags.RUN_SET_RANDOM_SEED,
                      value=4321 + seed_adjustment)
    np.random.seed(4321 + seed_adjustment)
    phase_train = not (self._doing_eval or self.params.forward_only)

    if self._doing_eval:
      mode_string = 'evaluation'
    else:
      mode_string = 'training'

    log_fn('Generating {} model'.format(mode_string))
    losses = []
    device_grads = []
    all_logits = []
    all_accuracy_ops = {}
    gpu_compute_stage_ops = []
    gpu_grad_stage_ops = []

    with tf.device(self.global_step_device):
      global_step = tf.train.get_or_create_global_step()
      self._maybe_initialize_fp16()

    # Build the processing and model for the worker.
    input_producer_op = None
    with tf.name_scope('input_processing'):
      input_processing_info = self._build_input_processing(shift_ratio=0)
      if input_processing_info.input_producer_op is not None:
        input_producer_op = tf.group(*input_processing_info.input_producer_op)
    update_ops = None
    staging_delta_ops = []

    for device_num in range(len(self.devices)):
      with tf.name_scope('tower_%i' % device_num) as name_scope, (
          self.variable_mgr.create_outer_variable_scope(device_num)):
        results = self.add_forward_pass_and_gradients(
            phase_train, device_num, device_num, input_processing_info,
            gpu_compute_stage_ops, gpu_grad_stage_ops)

        if self.params.backbone_model_path:
          self.model.add_backbone_saver()

        if phase_train:
          losses.append(results['loss'])
          device_grads.append(results['gradvars'])
        else:
          all_logits.append(results['logits'])
        if not phase_train or self.params.print_training_accuracy:
          for name, op in results.items():
            if name.startswith('accuracy:'):
              key = name[9:]
              if key not in all_accuracy_ops:
                all_accuracy_ops[key] = []
              all_accuracy_ops[key].append(op)

        if device_num == 0:
          # Retain the Batch Normalization updates operations only from the
          # first tower. These operations update the moving mean and moving
          # variance variables, which are updated (but not used) during
          # training, and used during evaluation. The moving mean and variance
          # approximate the true mean and variance across all images in the
          # dataset. Therefore, in replicated mode, these moving averages would
          # be almost identical for each tower, and so we only update and save
          # the moving averages for one tower. In parameter server mode, all
          # towers share a copy of the variables so we also only need to update
          # and save the moving averages once.
          update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope)
          if self.datasets_use_prefetch:
            assert not self.variable_mgr.staging_delta_ops
          else:
            staging_delta_ops = list(self.variable_mgr.staging_delta_ops)

    enqueue_ops = []
    if not self.datasets_use_prefetch:
      if self.variable_mgr.supports_staged_vars():
        for staging_ops in self.variable_mgr.staging_vars_on_devices:
          gpu_compute_stage_ops.extend(
              [put_op for _, (put_op, _) in six.iteritems(staging_ops)])
      enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
                                  name='gpu_compute_stage_ops_group'))
      if gpu_grad_stage_ops:
        staging_delta_ops += gpu_grad_stage_ops
      if staging_delta_ops:
        enqueue_ops.append(tf.group(*(staging_delta_ops)))

    if (self.mode == constants.BenchmarkMode.TRAIN_AND_EVAL and
        self.params.variable_update == 'replicated'):
      # We need to get all the update ops instead of only those for the first
      # tower. This is because during evaluation, each tower will read from its
      # own tower's moving averages instead of the first tower's moving
      # averages.
      # TODO(reedwm): Have each tower read from the first tower's moving
      # averages for a slight performance gain.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    mlperf.logger.log(key=mlperf.tags.INPUT_BN_SPAN,
                      value=self.batch_size // len(self.raw_devices))
    fetches = self._build_fetches(global_step, all_logits, losses,
                                  device_grads, enqueue_ops, update_ops,
                                  all_accuracy_ops, phase_train)
    return (input_producer_op, enqueue_ops, fetches)

  def _build_fetches(self, global_step, all_logits, losses, device_grads,
                     enqueue_ops, update_ops, all_accuracy_ops, phase_train):
    """Complete construction of model graph, populating the fetches map."""
    fetches = {}
    if enqueue_ops:
      fetches['enqueue_ops'] = enqueue_ops
    for name, ops in all_accuracy_ops.items():
      # For fetches that starts with 'tensor:', keep dimension and skip reducing
      # them to scalars.
      if name.startswith(constants.UNREDUCED_ACCURACY_OP_PREFIX):
        key = name[len(constants.UNREDUCED_ACCURACY_OP_PREFIX):]
        fetches[key] = tf.concat(ops, 0)
      else:
        fetches[name] = tf.reduce_sum(ops) / (
            self.batch_size * (self.num_workers if self.single_session else 1))
        if self.task_index == 0 and self.params.summary_verbosity >= 1:
          tf.summary.scalar(name, fetches[name])

    if not phase_train:
      if self.params.forward_only:
        fetches['all_logits'] = tf.concat(all_logits, 0)
      return fetches
    apply_gradient_devices, gradient_state = (
        self.variable_mgr.preprocess_device_grads(device_grads))

    # TODO(reedwm): Greatly simplify the learning rate code.
    if (self.params.variable_update == 'horovod' or
        self.params.variable_update == 'collective_all_reduce'):
      # Each worker independently increments global_step.
      examples_per_step = self.batch_size * self.num_workers
    else:
      # global_step is shared by all workers, and so every iteration
      # global_step is incremented by num_workers.
      examples_per_step = self.batch_size
    if self.params.compute_lr_on_cpu:
      with tf.device(self.cpu_device):
        learning_rate = get_learning_rate(
            self.params, global_step, self.dataset.num_examples_per_epoch(),
            self.model, examples_per_step)

    training_ops = []
    for d, device in enumerate(apply_gradient_devices):
      with tf.device(device):
        with tf.name_scope('average_loss'):
          average_loss = tf.reduce_mean(losses)
        with tf.name_scope('get_gradients_to_apply'):
          avg_grads = self.variable_mgr.get_gradients_to_apply(d,
                                                               gradient_state)

        if not self.params.compute_lr_on_cpu:
          # We compute the learning rate once for each device in
          # `apply_gradient_devices`.
          learning_rate = get_learning_rate(
              self.params, global_step, self.dataset.num_examples_per_epoch(),
              self.model, examples_per_step)
        gradient_clip = self.params.gradient_clip
        if gradient_clip is not None:
          with tf.name_scope('clip_gradients'):
            clipped_grads = [(tf.clip_by_value(grad, -gradient_clip,
                                               +gradient_clip), var)
                             for grad, var in avg_grads]
        else:
          clipped_grads = avg_grads

        learning_rate = tf.identity(learning_rate, name='learning_rate_tensor')
        opt = get_optimizer(self.params, learning_rate)
        loss_scale_params = variable_mgr_util.AutoLossScaleParams(
            enable_auto_loss_scale=self.enable_auto_loss_scale,
            loss_scale=self.loss_scale,
            loss_scale_normal_steps=self.loss_scale_normal_steps,
            inc_loss_scale_every_n=self.params.fp16_inc_loss_scale_every_n,
            is_chief=not self.job_name or self.task_index == 0)

        with tf.name_scope('append_apply_gradient_ops'):
          self.variable_mgr.append_apply_gradients_ops(
              gradient_state, opt, clipped_grads, training_ops,
              loss_scale_params)
    train_op = tf.group(*(training_ops + update_ops), name='train_ops_group')

    with tf.device(self.cpu_device):
      if self.task_index == 0 and self.params.summary_verbosity >= 1:
        tf.summary.scalar('learning_rate', learning_rate)
        tf.summary.scalar(self.params.loss_type_to_report, average_loss)
        if self.loss_scale is not None:
          tf.summary.scalar('loss_scale', self.loss_scale)
        if self.loss_scale_normal_steps:
          tf.summary.scalar('loss_scale_normal_steps',
                            self.loss_scale_normal_steps)

        if self.params.summary_verbosity >= 2:
          self.gradient_histogram_summary(avg_grads)

        if self.params.summary_verbosity >= 3:
          for grad, var in avg_grads:
            if grad is not None:
              tf.summary.histogram(var.op.name + '/gradients', grad)
          for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

    fetches['train_op'] = train_op
    fetches['average_loss'] = average_loss
    return fetches

  def gradient_histogram_summary(self, avg_grads):
    """Create histogram of log values of all non-zero gradients."""
    with tf.name_scope('log_gradients_summary'):
      all_grads = []
      for grad, _ in avg_grads:
        all_grads.append(tf.reshape(grad, [-1]))
      grads = tf.abs(tf.concat(all_grads, 0))
      # exclude grads with zero values.
      indices_for_non_zero_grads = tf.where(tf.not_equal(grads, 0))
      log_grads = tf.reshape(
          tf.log(tf.gather(grads, indices_for_non_zero_grads)), [-1])
      tf.summary.histogram('log_gradients', log_grads)

  def _build_model_single_session(self):
    """Build the TensorFlow graph for multiple replicas in a single_session.

    Returns:
      input_producer_op:
      enqueue_ops:
      fetches:

    Raises:
      ValueError: optimizer not recognized.

    Single session runs multiple model replicas as part of one large
    distributed graph, whose global execution is always step-synchronized.
    """
    # verify assumptions
    assert self.params.task_index == 0
    assert not self._doing_eval
    assert not self.params.forward_only
    assert not self.params.staged_vars

    tf.set_random_seed(self.params.tf_random_seed)
    np.random.seed(4321)
    phase_train = True

    log_fn('Generating training model')
    losses = []
    device_grads = []
    all_logits = []
    all_accuracy_ops = {}
    gpu_compute_stage_ops = []
    gpu_grad_stage_ops = []

    with tf.device(self.global_step_device):
      global_step = tf.train.get_or_create_global_step()

    update_ops = []
    global_input_producer_op = []

    is_local = not self.job_name
    if is_local:
      assert self.num_workers == 1
    for task_num in range(self.num_workers):
      # Reset the devices that self.variable_mgr knows about to those
      # belonging to the next worker (task).
      self.reset_devices_for_task(task_num, is_local)
      # Build the per-worker image processing
      with tf.name_scope('input_processing'):
        input_processing_info = self._build_input_processing(
            shift_ratio=(task_num / self.num_workers))
        if input_processing_info.input_producer_op is not None:
          global_input_producer_op.extend(
              input_processing_info.input_producer_op)
      # Build the per-worker model replica.
      for rel_device_num in range(len(self.devices)):
        abs_device_num = task_num * len(self.devices) + rel_device_num
        with self.variable_mgr.create_outer_variable_scope(
            abs_device_num), tf.name_scope(
                'task_%i_tower_%i' % (task_num, rel_device_num)) as name_scope:
          task_results = self.add_forward_pass_and_gradients(
              phase_train, rel_device_num, abs_device_num,
              input_processing_info, gpu_compute_stage_ops, gpu_grad_stage_ops)

          if self.params.backbone_model_path:
            self.model.add_backbone_saver()

          if phase_train:
            losses.append(task_results['loss'])
            device_grads.append(task_results['gradvars'])
          else:
            all_logits.append(task_results['logits'])
          if not phase_train or self.params.print_training_accuracy:
            for name, op in task_results.items():
              if name.startswith('accuracy:'):
                key = name[9:]
                if key not in all_accuracy_ops:
                  all_accuracy_ops[key] = []
                all_accuracy_ops[key].append(op)

          if rel_device_num == 0:
            # Retain the Batch Normalization updates operations only
            # from the first tower. These operations update the moving
            # mean and moving variance variables, which are updated
            # (but not used) during training, and used during
            # evaluation. The moving mean and variance approximate the
            # true mean and variance across all images in the
            # dataset. Therefore, in replicated mode, these moving
            # averages would be almost identical for each tower, and
            # so we only update and save the moving averages for one
            # tower. In parameter server mode, all towers share a copy
            # of the variables so we also only need to update and save
            # the moving averages once.
            update_ops.extend(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS, name_scope))
            assert not self.variable_mgr.staging_delta_ops

    enqueue_ops = []
    if gpu_compute_stage_ops:
      enqueue_ops.append(tf.group(*gpu_compute_stage_ops,
                                  name='gpu_compute_stage_ops'))
    assert not self.variable_mgr.supports_staged_vars()
    assert not gpu_grad_stage_ops

    fetches = self._build_fetches(global_step, all_logits, losses,
                                  device_grads, enqueue_ops, update_ops,
                                  all_accuracy_ops, phase_train)
    if global_input_producer_op:
      global_input_producer_op = tf.group(*global_input_producer_op)
    else:
      global_input_producer_op = None
    return (global_input_producer_op, enqueue_ops, fetches)

  def add_forward_pass_and_gradients(self, phase_train, rel_device_num,
                                     abs_device_num, input_processing_info,
                                     gpu_compute_stage_ops,
                                     gpu_grad_stage_ops):
    """Add ops for forward-pass and gradient computations."""
    nclass = self.dataset.num_classes
    if self.datasets_use_prefetch:
      assert input_processing_info.multi_device_iterator_input, (
          'multi_device_iterator_input cannot be None if '
          'datasets_use_prefetch=True')
      input_list = (
          input_processing_info.multi_device_iterator_input[rel_device_num])
    else:
      if not self.dataset.use_synthetic_gpu_inputs():
        input_producer_stage = input_processing_info.input_producer_stages[
            rel_device_num]
        with tf.device(self.cpu_device):
          host_input_list = input_producer_stage.get()
        with tf.device(self.raw_devices[rel_device_num]):
          gpu_compute_stage = data_flow_ops.StagingArea(
              [inp.dtype for inp in host_input_list],
              shapes=[inp.get_shape() for inp in host_input_list])
          # The CPU-to-GPU copy is triggered here.
          gpu_compute_stage_op = gpu_compute_stage.put(host_input_list)
          input_list = gpu_compute_stage.get()
          gpu_compute_stage_ops.append(gpu_compute_stage_op)
      else:
        with tf.device(self.raw_devices[rel_device_num]):
          # Minor hack to avoid H2D copy when using synthetic data
          input_list = self.model.get_synthetic_inputs(
              BenchmarkCNN.GPU_CACHED_INPUT_VARIABLE_NAME, nclass)

    # Labels reshaping happens all on gpu:0. Reshaping synthetic labels on
    # multiple devices slows down XLA computation for an unknown reason.
    # TODO(b/116875203): Find/address root cause of XLA slow down.
    labels_device_placement_hack = (
        self.dataset.use_synthetic_gpu_inputs() and self.params.xla_compile)

    def device_aware_reshape(tensor, shape):
      device = self.devices[rel_device_num]
      # Labels are int32, place reshapes on gpu:0 (no device placement) when the
      # hack is enabled.
      if labels_device_placement_hack and tensor.dtype == tf.int32:
        device = ''
      with tf.device(device):
        return tf.reshape(tensor, shape=shape)

    subset = 'validation' if self._doing_eval else 'train'
    input_shapes = self.model.get_input_shapes(subset)
    input_list = [
        device_aware_reshape(input_list[i], shape=input_shapes[i])
        for i in range(len(input_list))
    ]

    def forward_pass_and_gradients():
      """Builds forward pass and gradient computation network.

      When phase_train=True and print_training_accuracy=False:
        return [loss] + grads

      When phase_train=True and print_training_accuracy=True:
        return [logits, loss] + grads

      When phase_train=False,
        return [logits]

      Its output can always be unpacked by
      ```
        outputs = forward_pass_and_gradients()
        logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
      ```

      Returns:
        outputs: A list of tensors depending on different modes.
      """
      build_network_result = self.model.build_network(
          input_list, phase_train, nclass)
      logits = build_network_result.logits
      if not phase_train:
        return [logits]

      base_loss = self.model.loss_function(input_list, build_network_result)
      params = self.variable_mgr.trainable_variables_on_device(
          rel_device_num, abs_device_num)
      l2_loss = None
      total_loss = base_loss
      with tf.name_scope('l2_loss'):
        fp32_params = params
        if self.model.data_type == tf.float16 and self.params.fp16_vars:
          # fp16 reductions are very slow on GPUs, so cast to fp32 before
          # calling tf.nn.l2_loss and tf.add_n.
          # TODO(b/36217816): Once the bug is fixed, investigate if we should do
          # this reduction in fp16.
          fp32_params = (tf.cast(p, tf.float32) for p in params)

        filtered_params = self.model.filter_l2_loss_vars(fp32_params)
        if rel_device_num == len(self.devices) - 1:
          # We compute the L2 loss for only one device instead of all of them,
          # because the L2 loss for each device is the same. To adjust for this,
          # we multiply the L2 loss by the number of devices. We choose the
          # last device because for some reason, on a Volta DGX1, the first four
          # GPUs take slightly longer to complete a step than the last four.
          # TODO(reedwm): Shard the L2 loss computations across GPUs.
          if self.params.single_l2_loss_op:
            # TODO(reedwm): If faster, create a fused op that does the L2 loss
            # on multiple tensors, and use that instead of concatenating
            # tensors.
            reshaped_params = [tf.reshape(p, (-1,)) for p in filtered_params]
            l2_loss = tf.nn.l2_loss(tf.concat(reshaped_params, axis=0))
          else:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in filtered_params])
      weight_decay = self.params.weight_decay
      mlperf.logger.log(key=mlperf.tags.OPT_WEIGHT_DECAY, value=weight_decay)
      if (weight_decay is not None and weight_decay != 0. and
          l2_loss is not None):
        mlperf.logger.log(key=mlperf.tags.MODEL_L2_REGULARIZATION,
                          value=weight_decay)
        total_loss += len(self.devices) * weight_decay * l2_loss

      aggmeth = tf.AggregationMethod.DEFAULT
      scaled_loss = (total_loss if self.loss_scale is None
                     else total_loss * self.loss_scale)
      grads = tf.gradients(scaled_loss, params, aggregation_method=aggmeth)
      if self.params.sparse_to_dense_grads:
        # Passing a sparse gradient to convert_to_tensor turns it into a dense
        # gradient. A sparse gradient is an instance of tf.IndexedSlices.
        # convert_to_tensor does not modify dense tensors.
        grads = [tf.convert_to_tensor(g) for g in grads]
      if self.loss_scale is not None:
        # TODO(reedwm): If automatic loss scaling is not used, we could avoid
        # these multiplications by directly modifying the learning rate instead.
        # If this is done, care must be taken to ensure that this scaling method
        # is correct, as some optimizers square gradients and do other
        # operations which might not be compatible with modifying both the
        # gradients and the learning rate.
        grads = [
            grad * tf.cast(1. / self.loss_scale, grad.dtype) for grad in grads
        ]

      if self.params.variable_update == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        if self.params.horovod_device:
          horovod_device = '/%s:0' % self.params.horovod_device
        else:
          horovod_device = ''
        # All-reduce gradients using Horovod.
        grads = [hvd.allreduce(grad, average=False, device_dense=horovod_device)
                 for grad in grads]

      if self.params.staged_vars:
        grad_dtypes = [grad.dtype for grad in grads]
        grad_shapes = [grad.shape for grad in grads]
        grad_stage = data_flow_ops.StagingArea(grad_dtypes, grad_shapes)
        grad_stage_op = grad_stage.put(grads)
        # In general, this decouples the computation of the gradients and
        # the updates of the weights.
        # During the pipeline warm up, this runs enough training to produce
        # the first set of gradients.
        gpu_grad_stage_ops.append(grad_stage_op)
        grads = grad_stage.get()

      if self.params.loss_type_to_report == 'total_loss':
        loss = total_loss
      else:
        loss = base_loss

      if self.params.print_training_accuracy:
        return [logits, loss] + grads
      else:
        return [loss] + grads

    def unpack_forward_pass_and_gradients_output(forward_pass_and_grad_outputs):
      """Unpacks outputs from forward_pass_and_gradients.

      Args:
        forward_pass_and_grad_outputs: Output from forward_pass_and_gradients.

      Returns:
        logits: Unscaled probability distribution from forward pass.
          If unavailable, None is returned.
        loss: Loss function result from logits.
          If unavailable, None is returned.
        grads: Gradients for all trainable variables.
          If unavailable, None is returned.
      """
      logits = None
      # logits is only fetched in non-train mode or when
      # print_training_accuracy is set.
      if not phase_train or self.params.print_training_accuracy:
        logits = forward_pass_and_grad_outputs.pop(0)

      loss = (
          forward_pass_and_grad_outputs[0]
          if forward_pass_and_grad_outputs else None)
      grads = (
          forward_pass_and_grad_outputs[1:]
          if forward_pass_and_grad_outputs else None)

      return logits, loss, grads

    def make_results(logits, loss, grads):
      """Generate results based on logits, loss and grads."""
      results = {}  # The return value

      if logits is not None:
        results['logits'] = logits
        accuracy_ops = self.model.accuracy_function(input_list, logits)
        for name, op in accuracy_ops.items():
          results['accuracy:' + name] = op

      if loss is not None:
        results['loss'] = loss

      if grads is not None:
        param_refs = self.variable_mgr.trainable_variables_on_device(
            rel_device_num, abs_device_num, writable=True)
        results['gradvars'] = list(zip(grads, param_refs))

      return results

    with tf.device(self.devices[rel_device_num]):
      outputs = maybe_compile(forward_pass_and_gradients, self.params)
      logits, loss, grads = unpack_forward_pass_and_gradients_output(outputs)
      return make_results(logits, loss, grads)

  def get_input_preprocessor(self):
    """Returns the image preprocessor to be used, based on the model.

    Returns:
      The image preprocessor, or None if synthetic data should be used.
    """
    shift_ratio = 0
    if self.job_name:
      # shift_ratio prevents multiple workers from processing the same batch
      # during a step
      shift_ratio = self.task_index / self.num_workers

    processor_class = self.dataset.get_input_preprocessor(
        self.params.input_preprocessor)
    assert processor_class
    subset = 'validation' if self._doing_eval else 'train'
    return processor_class(
        self.batch_size * self.batch_group_size,
        self.model.get_input_shapes(subset),
        len(self.devices) * self.batch_group_size,
        dtype=self.model.data_type,
        train=(not self._doing_eval),
        # TODO(laigd): refactor away image model specific parameters.
        distortions=self.params.distortions,
        resize_method=self.resize_method,
        shift_ratio=shift_ratio,
        summary_verbosity=self.params.summary_verbosity,
        distort_color_in_yiq=self.params.distort_color_in_yiq,
        fuse_decode_and_crop=self.params.fuse_decode_and_crop,
        match_mlperf=self.params.ml_perf)

  def add_sync_queues_and_barrier(self, name_prefix, enqueue_after_list):
    """Adds ops to enqueue on all worker queues.

    Args:
      name_prefix: prefixed for the shared_name of ops.
      enqueue_after_list: control dependency from ops.

    Returns:
      An op that should be used as control dependency before starting next step.
    """
    self.sync_queue_counter += 1
    with tf.device(self.sync_queue_devices[(
        self.sync_queue_counter % len(self.sync_queue_devices))]):
      sync_queues = [
          tf.FIFOQueue(self.num_workers, [tf.bool], shapes=[[]],
                       shared_name='%s%s' % (name_prefix, i))
          for i in range(self.num_workers)]
      queue_ops = []
      # For each other worker, add an entry in a queue, signaling that it can
      # finish this step.
      token = tf.constant(False)
      with tf.control_dependencies(enqueue_after_list):
        for i, q in enumerate(sync_queues):
          if i == self.task_index:
            queue_ops.append(tf.no_op())
          else:
            queue_ops.append(q.enqueue(token))

      # Drain tokens off queue for this worker, one for each other worker.
      queue_ops.append(
          sync_queues[self.task_index].dequeue_many(len(sync_queues) - 1))

      return tf.group(*queue_ops)
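

# Illustrative sketch (not part of the original benchmark_cnn code): a pure
# Python simulation of the synchronization barrier built above with FIFO
# queues. Each worker enqueues one token into every other worker's queue and
# then drains num_workers - 1 tokens from its own queue, so no worker can
# proceed until every worker has reached the barrier. The function name and
# the use of collections.deque are hypothetical; the real barrier runs as
# TensorFlow queue ops.
def _example_sync_barrier_round(num_workers):
  """Simulates one barrier round and returns the tokens drained per worker."""
  import collections
  queues = [collections.deque() for _ in range(num_workers)]
  # Phase 1: every worker signals all of the other workers.
  for worker in range(num_workers):
    for other in range(num_workers):
      if other != worker:
        queues[other].append(False)  # token value mirrors tf.constant(False)
  # Phase 2: every worker drains one token per other worker from its own queue.
  drained = []
  for worker in range(num_workers):
    tokens = [queues[worker].popleft() for _ in range(num_workers - 1)]
    drained.append(tokens)
  return drained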


def _is_mkl_flag_absent(mkl_flag):
  return not (absl_flags.FLAGS.is_parsed() and mkl_flag in absl_flags.FLAGS and
              absl_flags.FLAGS[mkl_flag].present)


def _print_os_env_ignored_warning(mkl_flag, flag_default_val, os_env_var):
  tf.logging.warn(
      ('OS ENV variable %s=%s is ignored and script default: '
       '%s is used. Use --%s to override.') %
      (os_env_var, os.environ[os_env_var], flag_default_val, mkl_flag))


def set_default_param_values_and_env_vars(params):
  """Sets up the default param values and environment variables."""
  if params.batchnorm_persistent:
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
  else:
    os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None)
  if params.winograd_nonfused:
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
  else:
    os.environ.pop('TF_ENABLE_WINOGRAD_NONFUSED', None)
  if params.autotune_threshold:
    os.environ['TF_AUTOTUNE_THRESHOLD'] = str(params.autotune_threshold)
  os.environ['TF_SYNC_ON_FINISH'] = str(int(params.sync_on_finish))
  argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)

  # Sets environment variables for MKL
  # If OS ENV vars are overridden by script defaults, a warning msg is printed.
  if params.mkl:
    mkl_flags = ['kmp_blocktime', 'kmp_settings', 'kmp_affinity',
                 'num_intra_threads']
    for mkl_flag in mkl_flags:
      os_env_var = mkl_flag.upper()
      if mkl_flag == 'num_intra_threads':
        os_env_var = 'OMP_NUM_THREADS'
      flag_val = str(getattr(params, mkl_flag))
      if _is_mkl_flag_absent(mkl_flag) and os_env_var in os.environ:
        _print_os_env_ignored_warning(mkl_flag, flag_val, os_env_var)
      os.environ[os_env_var] = flag_val
      if mkl_flag == 'num_intra_threads' and not params.num_intra_threads:
        os.environ.pop(os_env_var, None)

  # Sets GPU thread settings
  if params.device.lower() == 'gpu':
    params = params._replace(gpu_thread_mode=params.gpu_thread_mode.lower())
    if params.gpu_thread_mode not in ['global', 'gpu_shared', 'gpu_private']:
      raise ValueError('Invalid gpu_thread_mode: %s' % params.gpu_thread_mode)
    os.environ['TF_GPU_THREAD_MODE'] = params.gpu_thread_mode

    if params.per_gpu_thread_count and params.gpu_thread_mode == 'global':
      raise ValueError(
          'Invalid per_gpu_thread_count with gpu_thread_mode=global: %s' %
          params.per_gpu_thread_count)
    # Default to two threads. One for the device compute and the other for
    # memory copies.
    per_gpu_thread_count = params.per_gpu_thread_count or 2
    total_gpu_thread_count = per_gpu_thread_count * params.num_gpus

    if params.gpu_thread_mode == 'gpu_private':
      os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
    elif params.gpu_thread_mode == 'gpu_shared':
      os.environ['TF_GPU_THREAD_COUNT'] = str(total_gpu_thread_count)

    cpu_count = multiprocessing.cpu_count()
    if not params.num_inter_threads and params.gpu_thread_mode in [
        'gpu_private', 'gpu_shared'
    ]:
      main_thread_count = max(cpu_count - total_gpu_thread_count, 1)
      params = params._replace(num_inter_threads=main_thread_count)

    if (params.datasets_use_prefetch and
        params.datasets_num_private_threads is None):
      # From the total cpu thread count, subtract the total_gpu_thread_count,
      # and then 2 threads per GPU device for event monitoring and sending /
      # receiving tensors
      num_monitoring_threads = 2 * params.num_gpus
      num_private_threads = max(
          cpu_count - total_gpu_thread_count - num_monitoring_threads, 1)
      params = params._replace(
          datasets_num_private_threads=num_private_threads)
  return params
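

# Illustrative sketch (not part of the original code): the thread budgeting
# arithmetic applied above for gpu_thread_mode in ('gpu_private',
# 'gpu_shared'). The function name and the returned dict are hypothetical;
# the real code writes these values into params and os.environ.
def _example_gpu_thread_budget(cpu_count, num_gpus, per_gpu_thread_count=None):
  """Returns the derived thread counts for a private/shared GPU thread mode."""
  per_gpu = per_gpu_thread_count or 2  # default: one compute + one memcpy
  total_gpu_threads = per_gpu * num_gpus
  inter_op_threads = max(cpu_count - total_gpu_threads, 1)
  monitoring_threads = 2 * num_gpus
  dataset_private_threads = max(
      cpu_count - total_gpu_threads - monitoring_threads, 1)
  return {
      'TF_GPU_THREAD_COUNT (gpu_private)': per_gpu,
      'TF_GPU_THREAD_COUNT (gpu_shared)': total_gpu_threads,
      'num_inter_threads': inter_op_threads,
      'datasets_num_private_threads': dataset_private_threads,
  }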


def setup(params):
  """Sets up the environment that BenchmarkCNN should run in.

  Args:
    params: Params tuple, typically created by make_params or
            make_params_from_flags.

  Returns:
    A potentially modified params.
  Raises:
    ValueError: invalid params combinations.
  """
  # Set up environment variables before doing any other global initialization to
  # make sure it uses the appropriate environment variables.
  params = set_default_param_values_and_env_vars(params)

  # horovod needs to be initialized before create_config_proto() call since
  # it will be used in config generation if enabled.
  if params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    hvd.init()

  platforms_util.initialize(params, create_config_proto(params))

  if not params.job_name:
    # Create a dummy session to initialize TF global variables using the input
    # params. Otherwise, ListDevices function may create global devices using
    # the default config instead of using the user provided config.
    #
    # TODO(hinsu): Find a way to achieve the same for distributed benchmark. It
    # is not legal to create distributed session after local session. It is also
    # not possible to create distributed session here as that results in
    # multiple creation of ClusterManager and Server.
    with tf.Session(config=create_config_proto(params)) as sess:
      del sess

  return params
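

# Illustrative sketch (not part of the original code): the warm-up rounding
# performed in _benchmark_graph. The number of warm-up steps is rounded up so
# that (num_warmup_batches + num_enqueue_ops - 1) is a multiple of
# batch_group_size, which keeps the benchmark timer aligned with an
# image_producer barrier. The helper name is hypothetical.
def _example_round_up_warmup(num_warmup_batches, num_enqueue_ops,
                             batch_group_size):
  """Returns the adjusted warm-up step count used by _benchmark_graph."""
  import math
  if (num_warmup_batches + num_enqueue_ops - 1) % batch_group_size == 0:
    return num_warmup_batches
  rounded = int(
      math.ceil((num_warmup_batches + num_enqueue_ops - 1.0) /
                batch_group_size) * batch_group_size - num_enqueue_ops + 1)
  assert (rounded + num_enqueue_ops - 1) % batch_group_size == 0
  return rounded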


def maybe_compile(computation, params):
  if params and params.xla_compile:
    return tf.xla.experimental.compile(computation)
  else:
    return computation()
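

# Illustrative usage sketch (not part of the original file): driving
# maybe_compile from a minimal stand-in params object. With xla_compile=False
# the computation runs as ordinary graph ops; with xla_compile=True it would
# be wrapped by tf.xla.experimental.compile as above. FakeParams is a
# hypothetical stand-in for the real params namedtuple used by this script.
def _example_maybe_compile_usage():
  """Builds a tiny computation through maybe_compile without XLA."""
  import collections
  fake_params = collections.namedtuple('FakeParams', ['xla_compile'])
  def computation():
    a = tf.constant([1.0, 2.0, 3.0])
    return [a * a]
  return maybe_compile(computation, fake_params(xla_compile=False))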