Commit ee3997b3 authored by qianyj

new tf branch for dtk21.10.1

parent 2795dc1f
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Upload test results."""
from __future__ import print_function
import importlib
import json
import logging
import perfzero.utils as utils
import psutil
import socket
from six import u as unicode # pylint: disable=W0622
def execute_methods(method_names_str, *args, **kwargs):
"""Calls a list of method names on given function params.
Args:
method_names_str: String - Comma-separated module.foo.bar.method strings.
This function imports module.foo.bar for each such method and calls it
with *args and **kwargs.
*args: Function params common to each method.
**kwargs: Function params common to each method.
Raises:
RuntimeError: If any of the invoked methods raised an exception.
"""
if not method_names_str:
return
errors = []
module_methods_list = method_names_str.split(',')
for module_method in module_methods_list:
try:
logging.info('Trying to call %s', module_method)
module_path, method_path = module_method.rsplit('.', 1)
this_module = importlib.import_module(module_path)
logging.info('Found module %s, looking for %s', module_path, method_path)
this_method = getattr(this_module, method_path)
logging.info('Found method %s', method_path)
this_method(*args, **kwargs)
except Exception as e: # pylint: disable=broad-except
errors.append(str(e))
if errors:
raise RuntimeError('\n' + '\n'.join(errors))
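# A hedged usage sketch (the module and method names below are hypothetical):
#
#   execute_methods('reporting.email.notify,reporting.dashboard.update',
#                   execution_summary, project='my-project')
#
# imports reporting.email and reporting.dashboard, then calls
# notify(execution_summary, project='my-project') and
# update(execution_summary, project='my-project') in turn; exceptions from
# all calls are collected and re-raised together as one RuntimeError.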
def upload_execution_summary(bigquery_project_name, bigquery_dataset_table_name,
execution_summary):
"""Upload benchmark summary.
Note: Using stream=False has a limit of 1,000 inserts per day per table.
Using stream=True, the documented limit is 50K+. With streaming there can be
a small and possibly unnoticeable delay before the results show up in the
BigQuery UI, but a delay of roughly 90 minutes before the results become
part of exports.
Note: BigQuery maps unicode() to STRING for Python 2. If str is used, it is
mapped to BYTES.
Args:
bigquery_project_name: Name of the gcp project.
bigquery_dataset_table_name: data_set and table name.
execution_summary: benchmark summary dictionary of results.
"""
# pylint: disable=C6204
import google.auth
from google.cloud import bigquery
if not bigquery_project_name:
logging.info(
'Skipped uploading benchmark result to bigquery because bigquery project name is not set.'
)
return
if not bigquery_dataset_table_name:
logging.info(
'Skipped uploading benchmark result to bigquery because bigquery dataset and table name is not set.'
)
return
credentials = google.auth.default()[0]
dataset_name = bigquery_dataset_table_name.split('.')[0]
table_name = bigquery_dataset_table_name.split('.')[1]
client = bigquery.Client(
project=bigquery_project_name, credentials=credentials)
benchmark_summary_input = {}
for key, value in execution_summary.items():
if isinstance(value, dict):
benchmark_summary_input[key] = unicode(json.dumps(value))
else:
benchmark_summary_input[key] = unicode(value)
logging.debug('Bigquery input for benchmark_summary table is %s',
json.dumps(benchmark_summary_input, indent=2))
errors = []
# TODO(tobyboyd): Shim to direct results to new table until all jobs
# are updated.
if 'benchmark_results' in dataset_name:
if dataset_name == 'benchmark_results_dev':
table_ref = client.dataset('perfzero_dev').table('benchmark_summary')
table_obj = client.get_table(table_ref)
elif dataset_name == 'benchmark_results':
table_ref = client.dataset('perfzero').table('benchmark_summary')
table_obj = client.get_table(table_ref)
else:
table_ref = client.dataset(dataset_name).table(table_name)
table_obj = client.get_table(table_ref)
errors.extend(client.insert_rows(table_obj, [benchmark_summary_input]))
if errors:
logging.error(
'Failed to upload benchmark result to bigquery due to errors %s',
errors)
else:
logging.info(
'Uploaded benchmark result to the table %s of the bigquery project %s.',
bigquery_dataset_table_name,
bigquery_project_name)
def build_benchmark_result(raw_benchmark_result, has_exception, trial_id):
"""Converts test_log.proto format to PerfZero format."""
benchmark_result = {}
benchmark_result['name'] = raw_benchmark_result['name']
benchmark_result['wall_time'] = raw_benchmark_result['wall_time']
succeeded = not has_exception
extras = []
for name in raw_benchmark_result.get('extras', {}):
entry = {}
entry['name'] = name
if 'double_value' in raw_benchmark_result['extras'][name]:
entry['value'] = raw_benchmark_result['extras'][name]['double_value']
else:
entry['value'] = raw_benchmark_result['extras'][name]['string_value']
extras.append(entry)
metrics = []
for metric in raw_benchmark_result.get('metrics', []):
value = metric['value']
if 'min_value' in metric and metric['min_value'] > value:
succeeded = False
if 'max_value' in metric and metric['max_value'] < value:
succeeded = False
metrics.append(metric)
benchmark_result['succeeded'] = succeeded
benchmark_result['extras'] = extras
benchmark_result['metrics'] = metrics
benchmark_result['trial_id'] = trial_id
return benchmark_result
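# A minimal sketch of the conversion, assuming a test_log.proto-style dict
# (all values below are hypothetical):
#
#   raw = {'name': 'resnet50_gpu_1', 'wall_time': 123.4,
#          'extras': {'batch_size': {'double_value': 64.0}},
#          'metrics': [{'name': 'exp_per_sec', 'value': 380.0,
#                       'min_value': 400.0}]}
#   result = build_benchmark_result(raw, has_exception=False, trial_id=1)
#
# Here result['succeeded'] is False because the metric value 380.0 falls
# below its declared min_value of 400.0.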
def build_execution_summary(execution_timestamp, execution_id,
ml_framework_build_label, execution_label,
platform_name, system_name, output_gcs_url,
benchmark_result, env_vars, flags, harness_info,
site_package_info, process_info, has_exception,
is_tpu_benchmark):
"""Builds summary of the execution."""
# Avoids module not found during setup phase when tf is not installed yet.
# pylint: disable=C6204
import tensorflow as tf
benchmark_info = {}
benchmark_info['harness_name'] = 'perfzero'
benchmark_info['harness_info'] = harness_info
benchmark_info['has_exception'] = has_exception
if execution_label:
benchmark_info['execution_label'] = execution_label
if output_gcs_url:
benchmark_info['output_url'] = '{}/{}/'.format(output_gcs_url, execution_id)
if env_vars:
benchmark_info['env_vars'] = env_vars
if flags:
benchmark_info['flags'] = flags
benchmark_info['site_package_info'] = site_package_info
ml_framework_info = {}
ml_framework_info['name'] = 'tensorflow'
ml_framework_info['version'] = tf.__version__
# tf.__git_version__ in Python3 has format b'version_string'
if tf.__git_version__[0] == 'b':
ml_framework_info['build_version'] = tf.__git_version__[2:-1]
else:
ml_framework_info['build_version'] = tf.__git_version__
if ml_framework_build_label:
ml_framework_info['build_label'] = ml_framework_build_label
system_info = {}
if platform_name:
system_info['platform_name'] = platform_name
if system_name:
system_info['system_name'] = system_name
if not is_tpu_benchmark:
gpu_info = utils.get_gpu_info()
if gpu_info:
system_info['accelerator_driver_version'] = gpu_info['gpu_driver_version']
system_info['accelerator_model'] = gpu_info['gpu_model']
system_info['accelerator_count'] = gpu_info['gpu_count']
system_info['cpu_model'] = utils.get_cpu_name()
system_info['physical_cpu_count'] = psutil.cpu_count(logical=False)
system_info['logical_cpu_count'] = psutil.cpu_count(logical=True)
system_info['cpu_socket_count'] = utils.get_cpu_socket_count()
system_info['hostname'] = socket.gethostname()
execution_summary = {}
execution_summary['execution_id'] = execution_id
execution_summary['execution_timestamp'] = execution_timestamp
execution_summary['benchmark_result'] = benchmark_result
execution_summary['benchmark_info'] = benchmark_info
execution_summary['setup_info'] = {}
execution_summary['ml_framework_info'] = ml_framework_info
execution_summary['system_info'] = system_info
if process_info:
execution_summary['process_info'] = process_info
return execution_summary
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Collect profiler data for Tensorboard with a separate thread."""
from __future__ import print_function
import logging
import os
import sched
import threading
import time
import traceback
import perfzero.utils as utils
def _start_profiler(output_dir):
"""Start profiler.
Args:
output_dir: log directory to place the profiler data
"""
import tensorflow as tf # pylint: disable=g-import-not-at-top
profiler_data_dir = os.path.join(output_dir, 'profiler_data')
utils.make_dir_if_not_exist(profiler_data_dir)
logging.info('Starting TensorFlow profiler and saving data to dir %s',
profiler_data_dir)
try:
tf.profiler.experimental.start(profiler_data_dir)
logging.info('Started TensorFlow profiler')
except Exception: # pylint: disable=broad-except
logging.error('TensorFlow profiler failed to start due to error:\n %s',
traceback.format_exc())
def _stop_profiler():
"""Stop profiler."""
import tensorflow as tf # pylint: disable=g-import-not-at-top
try:
tf.profiler.experimental.stop()
logging.info('Stopped TensorFlow profiler.')
except Exception: # pylint: disable=broad-except
logging.error('TensorFlow profiler failed to stop due to error:\n %s',
traceback.format_exc())
class TensorFlowProfiler(object):
"""Collect profiler data for Tensorboard with a separate thread."""
def __init__(self, profiler_enabled_time_str, output_dir):
"""Constructor.
Args:
profiler_enabled_time_str: the value of the config --profiler_enabled_time
output_dir: log directory to place the profiler data
"""
self.profiler_enabled_time_str = profiler_enabled_time_str
self.output_dir = output_dir
self.exit_event = threading.Event()
self.scheduler = sched.scheduler(time.time, self._sleep_until_exit)
def _sleep_until_exit(self, timeout):
start_time = time.time()
cur_time = time.time()
while cur_time - start_time < timeout and not self.exit_event.is_set():
time.sleep(min(1, timeout + start_time - cur_time))
cur_time = time.time()
def start(self):
"""Schedule start/stop profiler event specified in profiler_enabled_time_str."""
if not self.profiler_enabled_time_str:
return
last_end_time = -1
for time_str in self.profiler_enabled_time_str.split(','):
begin_time = int(time_str.split(':')[0].strip())
end_time_str = time_str.split(':')[1].strip() if ':' in time_str else None
end_time = int(end_time_str) if end_time_str else 365 * 24 * 60 * 60
if begin_time <= last_end_time:
raise ValueError('begin_time {} must be larger than the last '
'end_time {}'.format(begin_time, last_end_time))
if end_time <= begin_time:
raise ValueError('end_time {} must be larger than begin_time {}'.format(
end_time, begin_time))
# 4th positional arg added to support Python2 for the short-term.
self.scheduler.enter(begin_time, 1, _start_profiler,
argument=(self.output_dir,))
self.scheduler.enter(end_time, 1, _stop_profiler, ()) # pylint: disable=no-value-for-parameter
last_end_time = end_time
threading.Thread(target=self.scheduler.run).start()
def stop(self):
"""Stop scheduler and save profiler data if any event is cancelled."""
event_canceled = False
for event in self.scheduler.queue:
try:
self.scheduler.cancel(event)
event_canceled = True
except ValueError:
# This is OK because the event may have been just canceled
pass
# Signal the scheduler thread to stop sleeping
self.exit_event.set()
# Save the profiler data if any event is canceled
if event_canceled:
_stop_profiler()
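# A hedged usage sketch: with profiler_enabled_time_str = '30:90,300' the
# scheduler starts the profiler 30 seconds after start() is called, stops it
# at 90 seconds, then starts it again at 300 seconds; since the second range
# has no end time, the stop event is scheduled far in the future and is
# canceled (and the profiler data saved) when stop() is called.
#
#   profiler = TensorFlowProfiler('30:90,300', '/tmp/perfzero_logs')
#   profiler.start()  # returns immediately; events run on a separate thread
#   ...               # run the benchmark
#   profiler.stop()   # cancels pending events and saves profiler data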
"""Utility to manage the tpu version before starting the benchmark."""
import json
from absl import logging
from six.moves.urllib import request
try:
from cloud_tpu_client import client # pylint: disable=g-import-not-at-top
except ImportError:
print(
'Falling back to the TensorFlow client; we recommend you install the '
'Cloud TPU client directly with pip install cloud-tpu-client.')
from tensorflow.python.tpu.client import client # pylint: disable=g-import-not-at-top
def _as_text(s):
"""Converts a byte/string into string."""
if isinstance(s, bytes):
return s.decode('utf-8')
return s
def _get_content(url):
"""Opens the url and loads the response into json."""
logging.info('opening url %s', url)
req = request.Request(url)
resp = request.urlopen(req)
resp_text = _as_text(resp.read())
logging.info('response text = %s', resp_text)
return json.loads(resp_text)
def _get_version_info(url, version_label):
"""Constructs a version info from the response."""
json_data = _get_content(url)
logging.info('json_data = %s', json_data)
if 'currentVersion' in json_data:
commit_id = json_data['currentVersion']
elif 'buildLabel' in json_data:
commit_id = json_data['buildLabel']
else:
commit_id = ''
info = {
'url': '',
'hash': commit_id,
'branch': version_label,
'piper_id': json_data.get('piperOriginRevId', '')
}
return info
def _configure_tpu_version(tpu_name, version_label, new_version_id):
"""Returns the current tpu version after resetting to an optional version."""
# The tpu_name is arbitrary / user chosen unique string for this tpu.
logging.info('Trying to connect to tpu %s', tpu_name)
tpu_client = client.Client(tpu=tpu_name)
tpu_client.wait_for_healthy()
if new_version_id:
logging.info('Trying to reset tpu version to %s', new_version_id)
tpu_client.configure_tpu_version(version=new_version_id)
tpu_client.wait_for_healthy()
logging.info('TPU healthy after version reset.')
else:
logging.info('Using the default tpu version id.')
workers = tpu_client.network_endpoints()
if workers:
ip_addr = workers[0]['ipAddress']
url = 'http://{}:8475/requestversion'.format(ip_addr)
return _get_version_info(url, version_label)
else:
logging.error('No tpu endpoint info')
return {
'url': '',
'hash': '',
'branch': version_label,
'piper_id': '',
}
def configure_tpu(tpu_params):
return _configure_tpu_version(
tpu_params.get('name'),
version_label=tpu_params.get('version'),
new_version_id=tpu_params.get('version_id'))
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""PerfZero utility methods."""
from __future__ import print_function
import importlib
import logging
import os
import shutil
import subprocess
import sys
import threading
import traceback
import requests
import json
import re
def create_empty_file(parent_directory, file_basename):
"""Creates an empty file with a given basename in a parent directory.
Creates parent_directory and intermediate directories if they don't exist.
This is mostly used for creating no-op actions in the Dockerfile.
Args:
parent_directory: The path to the parent directory.
file_basename: The basename for the empty file.
"""
if not os.path.isdir(parent_directory):
os.makedirs(parent_directory)
full_file_name = os.path.join(parent_directory, file_basename)
with open(full_file_name, 'w'):
print('Creating empty file: {}'.format(full_file_name))
def checkout_git_repos(git_repos, use_cached_site_packages):
"""Clone, update, or sync a repo.
Args:
git_repos: array of dict containing attributes of the git repo to checkout.
use_cached_site_packages: If true, skip git pull if git_repo already exists.
Returns:
A dict containing attributes of the git repositories
"""
site_package_info = {}
for repo in git_repos:
logging.info('Checking out repository from %s to %s',
repo['url'], repo['local_path'])
if not os.path.isdir(repo['local_path']):
run_commands(['git clone {} {}'.format(repo['url'], repo['local_path'])])
if 'branch' in repo:
run_commands(['git -C {} checkout {}'.format(
repo['local_path'], repo['branch'])])
if not use_cached_site_packages or 'git_hash' in repo:
run_commands(['git -C {} pull --rebase'.format(repo['local_path'])])
if 'git_hash' in repo:
run_commands(['git -C {} reset --hard {}'.format(
repo['local_path'], repo['git_hash'])])
logging.info('Checked-out repository from %s to %s',
repo['url'], repo['local_path'])
site_package_info[repo['dir_name']] = get_git_repo_info(repo['local_path'])
return site_package_info
def get_git_repo_info(local_path):
"""Get information of the git repository specified by the local_path."""
git_repo_info = {}
# Get git url
cmd = 'git -C {} config --get remote.origin.url'.format(local_path)
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
git_repo_info['url'] = lines[0]
else:
logging.error('Error getting git url for repository %s due to %s',
local_path, result)
return {}
# Get git branch
cmd = 'git -C {} rev-parse --abbrev-ref HEAD'.format(local_path)
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
git_repo_info['branch'] = lines[0]
else:
logging.error('Error getting git branch for repository %s due to %s',
local_path, result)
return {}
# Get git hash
cmd = 'git -C {} rev-parse HEAD'.format(local_path)
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
git_repo_info['hash'] = lines[0]
else:
logging.error('Error getting git hash for repository %s due to %s',
local_path, result)
return {}
return git_repo_info
def setup_python_path(site_packages_dir, python_path_str):
if python_path_str:
python_paths = python_path_str.split(',')
for python_path in python_paths:
logging.info('Adding path %s to sys.path', python_path)
sys.path.append(os.path.join(site_packages_dir, python_path))
logging.debug('PYTHONPATH: %s', sys.path)
def active_gcloud_service(gcloud_key_file_url, workspace_dir,
download_only=False):
"""Download key file and setup gcloud service credential using the key file.
Args:
gcloud_key_file_url: gcloud key file url
workspace_dir: directory that the key file is downloaded to
download_only: skip setting up the gcloud service credential if this is true
"""
if not gcloud_key_file_url:
return
local_path = os.path.join(workspace_dir,
os.path.basename(gcloud_key_file_url))
if not os.path.exists(local_path):
download_data([{'url': gcloud_key_file_url, 'local_path': local_path}])
if not download_only:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = local_path
run_commands(['gcloud auth activate-service-account --key-file {}'.format(
local_path)])
logging.info('Activated gcloud service account credential')
def setup_gsutil_credential():
run_commands(['gcloud config set pass_credentials_to_gsutil true'])
def download_data(download_infos):
"""Download data from url to local_path for each (url, local_path) pair in the download_infos.
Each url should start with gs://, http://, https:// or file://.
A downloaded file whose name ends with .gz will be decompressed in the
directory that contains it, unless the info dict sets 'decompress' to False.
Args:
download_infos: array of dict which specifies the url and local_path for
data download
"""
for info in download_infos:
if os.path.exists(info['local_path']):
continue
original_base_name = os.path.basename(info['url'])
expected_base_name = os.path.basename(info['local_path'])
local_path_parent = os.path.dirname(info['local_path'])
logging.info('Downloading data from %s to %s',
info['url'], info['local_path'])
make_dir_if_not_exist(local_path_parent)
# Download data to the local path
if info['url'].startswith('http://') or info['url'].startswith('https://'):
request = requests.get(info['url'], allow_redirects=True)
f = open(info['local_path'], 'wb')
f.write(request.content)
f.close()
elif info['url'].startswith('gs://'):
cmd = ['gsutil', '-m', 'cp', '-r', '-n', info['url'], local_path_parent]
run_commands([cmd], shell=False)
elif info['url'].startswith('file://'):
cmd = ['cp', info['url'][7:], local_path_parent]
run_commands([cmd], shell=False)
else:
raise ValueError('Url {} with prefix {} is not supported.'.format(
info['url'], info['url'].split(':')[0]))
# Move data to the expected local path
if original_base_name != expected_base_name:
run_commands(['mv {} {}'.format(
os.path.join(local_path_parent, original_base_name),
os.path.join(local_path_parent, expected_base_name))])
logging.info('Downloaded data from %s to %s',
info['url'], info['local_path'])
# Decompress file if file name ends with .gz unless caller sets 'decompress'
# to False in info.
if info['url'].endswith('.gz') and info.get('decompress', True):
run_commands(['tar xvf {} -C {}'.format(
info['local_path'], local_path_parent)])
logging.info('Decompressed file %s', info['local_path'])
def parse_data_downloads_str(root_data_dir, data_downloads_str):
"""Parse a comma separated string into array of dicts.
Each dict specifies the url and local_path for a download.
Args:
root_data_dir: the directory which should contain all the dataset files
data_downloads_str: a comma separated string specified by the
flag --data_downloads
Returns:
An array of dict which specifies the url and local_path for data download
"""
download_infos = []
if not data_downloads_str:
return download_infos
for entry in data_downloads_str.split(','):
info = {}
if ';' in entry:
info['url'] = entry.split(';')[0]
info['local_path'] = os.path.join(root_data_dir, entry.split(';')[1])
else:
info['url'] = entry
info['local_path'] = os.path.join(root_data_dir, os.path.basename(entry))
# Canonicalize url to remove trailing '/' and '*'
if info['url'].endswith('*'):
info['url'] = info['url'][:-1]
if info['url'].endswith('/'):
info['url'] = info['url'][:-1]
download_infos.append(info)
return download_infos
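# A hedged example of the --data_downloads format (urls are hypothetical):
# each entry is either a bare url or 'url;relative_path', so
#
#   parse_data_downloads_str(
#       '/data', 'gs://bucket/imagenet/*;imagenet,https://host/labels.txt')
#
# yields [{'url': 'gs://bucket/imagenet', 'local_path': '/data/imagenet'},
#         {'url': 'https://host/labels.txt',
#          'local_path': '/data/labels.txt'}].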
def maybe_upload_to_gcs(local_dir, output_gcs_url):
if not output_gcs_url:
return
run_commands(['gsutil -m cp -r {} {}'.format(local_dir, output_gcs_url)])
logging.info('Uploaded data from local directory %s to gcs %s',
local_dir, output_gcs_url)
def make_dir_if_not_exist(local_path):
if not os.path.exists(local_path):
os.makedirs(local_path)
logging.info('Created directory %s', local_path)
def run_command(cmd, shell=True):
"""Structures for a variety of different test results.
Args:
cmd: Command to execute
shell: True to use shell, false otherwise.
Returns:
Tuple of the command return value and the standard out in as a string.
"""
logging.debug('Executing command: %s', cmd)
p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, shell=shell)
exit_code = None
line = ''
stdout = ''
while exit_code is None or line:
exit_code = p.poll()
line = p.stdout.readline().decode('utf-8')
stdout += line
logging.debug(line)
return exit_code, stdout
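# A hedged usage sketch:
#
#   exit_code, stdout = run_command('ls /tmp')
#   if exit_code != 0:
#       logging.error('ls failed with output:\n%s', stdout)
#
# Note that stdout also contains stderr output, because the child process is
# started with stderr redirected to stdout.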
def run_commands(cmds, shell=True):
"""Runs list of command and throw error if any fail."""
for cmd in cmds:
exit_code, stdout = run_command(cmd, shell=shell)
if exit_code:
raise Exception('"{}" failed with code:{} and stdout:\n{}'.format(
cmd, exit_code, stdout))
def get_cpu_name():
cmd = "cat /proc/cpuinfo | grep 'model name' | sort --unique"
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
model_name_parts = lines[0].split(':')
return model_name_parts[1].strip()
else:
logging.error('Error getting cpuinfo model name: %s', result)
return ''
def get_cpu_socket_count():
cmd = 'grep -i "physical id" /proc/cpuinfo | sort -u | wc -l'
exit_code, result = run_command(cmd)
lines = result.splitlines()
if exit_code == 0 and lines:
return int(lines[0])
else:
logging.error('Error getting cpuinfo socket count: %s', result)
return -1
def _get_amd_gpu_info():
"""Returns gpu information using rocm-smi.
Note: Assumes if the system has multiple GPUs, that they are all the same
Returns:
A dict containing gpu_driver_version, gpu_model and gpu_count or None if
`rocm-smi` is not found or fails.
"""
cmd = 'rocm-smi --json --showproductname --showdriverversion'
exit_code, result = run_command(cmd)
if exit_code != 0:
logging.error('rocm-smi did not return as expected: %s', result)
return None
def get_gpu_driver_version(rocm_smi_output):
return rocm_smi_output['system']['Driver version']
def get_gpu_model(rocm_smi_output):
gpu_model = ""
for key, value in rocm_smi_output.items():
if re.match("card[0-9]+", key):
gpu_model = value['Card SKU']
break
return gpu_model
def get_gpu_count(rocm_smi_output):
gpu_count = 0
for key, value in rocm_smi_output.items():
if re.match("card[0-9]+", key):
gpu_count += 1
return gpu_count
rocm_smi_output = json.loads(result)
gpu_info = {}
gpu_info['gpu_driver_version'] = get_gpu_driver_version(rocm_smi_output)
gpu_info['gpu_model'] = get_gpu_model(rocm_smi_output)
gpu_info['gpu_count'] = get_gpu_count(rocm_smi_output)
return gpu_info
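# The parsing above assumes rocm-smi JSON output shaped roughly like the
# following (keys abbreviated, values hypothetical):
#
#   {"system": {"Driver version": "5.13.20"},
#    "card0": {"Card SKU": "D34314"},
#    "card1": {"Card SKU": "D34314"}}
#
# which would yield {'gpu_driver_version': '5.13.20',
#                    'gpu_model': 'D34314', 'gpu_count': 2}.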
def _get_nvidia_gpu_info():
"""Returns gpu information using nvidia-smi.
Note: Assumes if the system has multiple GPUs that they are all the same with
one exception. If the first result is a Quadro, the heuristic assumes
this may be a workstation and takes the second entry.
Returns:
A dict containing gpu_driver_version, gpu_model and gpu_count or None if
`nvidia-smi` is not found or fails.
"""
cmd = 'nvidia-smi --query-gpu=driver_version,gpu_name --format=csv'
exit_code, result = run_command(cmd)
if exit_code != 0:
logging.error('nvidia-smi did not return as expected: %s', result)
return None
lines = result.splitlines()
gpu_info_line = lines[1]
if 'Quadro' in gpu_info_line and len(lines) >= 3:
gpu_info_line = lines[2]
gpu_info = {}
gpu_info['gpu_driver_version'] = gpu_info_line.split(',')[0].strip()
gpu_info['gpu_model'] = gpu_info_line.split(',')[1].strip()
gpu_info['gpu_count'] = len(lines) - 1
return gpu_info
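# For reference, the nvidia-smi query above produces csv output roughly like
# (values hypothetical):
#
#   driver_version, name
#   450.51.06, Tesla V100-SXM2-16GB
#   450.51.06, Tesla V100-SXM2-16GB
#
# which parses to {'gpu_driver_version': '450.51.06',
#                  'gpu_model': 'Tesla V100-SXM2-16GB', 'gpu_count': 2}.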
def get_gpu_info():
"""Returns gpu information using either nvidia-smi or rocm-smi.
Returns:
A dict containing gpu_driver_version, gpu_model and gpu_count, or None if
the corresponding smi tool is not found or fails.
"""
return _get_amd_gpu_info() if shutil.which("rocm-smi") \
else _get_nvidia_gpu_info()
def _install_tpu_tool():
"""Installs the ctpu tool to managing cloud TPUs.
Follows the instructions here:
https://github.com/tensorflow/tpu/tree/master/tools/ctpu
"""
if not os.path.exists('ctpu'):
logging.info('Installing TPU tool')
commands = [
'wget https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu',
'chmod a+x ctpu',
]
run_commands(commands)
def setup_tpu(parameters):
"""Sets up a TPU with a given set of parameters.
Args:
parameters: dictionary of TPU parameters.
Returns:
True if an error occurs during setup.
"""
try:
_install_tpu_tool()
args = [
'--name={}'.format(parameters.get('name')),
'--project={}'.format(parameters.get('project')),
'--zone={}'.format(parameters.get('zone')),
'--tpu-size={}'.format(parameters.get('size')),
'--tf-version={}'.format(parameters.get('version')),
'--tpu-only',
'-noconf',
]
command = './ctpu up {}'.format(' '.join(args))
logging.info('Setting up TPU: %s', command)
exit_code, output = run_command(command)
if exit_code != 0:
logging.error('Error in setup with output: %s', output)
return exit_code != 0
except Exception:
logging.error('Unable to setup TPU')
run_command('rm -f ctpu')
sys.exit(1)
def cleanup_tpu(parameters):
"""Cleans up an existing TPU.
Args:
parameters: dictionary of TPU parameters.
Returns:
True if an error occurs during cleanup.
"""
_install_tpu_tool()
args = [
'--name={}'.format(parameters.get('name')),
'--project={}'.format(parameters.get('project')),
'--zone={}'.format(parameters.get('zone')),
'--tpu-only',
'-noconf',
]
command = './ctpu delete {}'.format(' '.join(args))
logging.info('Cleaning up TPU: %s', command)
exit_code, output = run_command(command)
if exit_code != 0:
logging.error('Error in cleanup with output: %s', output)
return exit_code != 0
def read_benchmark_result(benchmark_result_file_path):
"""Read benchmark result from the protobuf file."""
from google.protobuf import json_format # pylint: disable=g-import-not-at-top
from tensorflow.core.util import test_log_pb2 # pylint: disable=g-import-not-at-top
if not os.path.isfile(benchmark_result_file_path):
logging.error('Failed to read benchmark result because '
'file %s does not exist', benchmark_result_file_path)
return {}
with open(benchmark_result_file_path, 'rb') as f:
benchmark_entries = test_log_pb2.BenchmarkEntries()
benchmark_entries.ParseFromString(f.read())
return json_format.MessageToDict(
benchmark_entries,
preserving_proto_field_name=True,
including_default_value_fields=True)['entry'][0]
def print_thread_stacktrace():
print('Here is the stacktrace for all threads:')
thread_names = {t.ident: t.name for t in threading.enumerate()}
for thread_id, frame in sys._current_frames().items(): # pylint: disable=protected-access
print('Thread {}'.format(thread_names.get(thread_id, thread_id)))
traceback.print_stack(frame)
def instantiate_benchmark_class(
benchmark_class, output_dir, root_data_dir, tpu, constructor_args,
benchmark_class_type=None):
"""Return initialized benchmark class."""
module_import_path, class_name = benchmark_class.rsplit('.', 1)
module = importlib.import_module(module_import_path)
class_ = getattr(module, class_name)
if benchmark_class_type == 'tf_benchmark':
# for benchmarks inheriting from tf.test.Benchmark, instantiate them directly.
instance = class_(**constructor_args)
else:
# Default instantiation for perfzero_benchmark classes.
instance = class_(
output_dir=output_dir,
root_data_dir=root_data_dir,
tpu=tpu,
**constructor_args)
return instance
def copy_and_rename_dirs(dir_spec_string, dst_base_dir):
"""Copies list of <dir-path>:new_name specs into a new dest dir.
If a path /path1/path2/dir:new_dir is given, it copies /path1/path2/dir to
dst_base_dir/new_dir.
Args:
dir_spec_string: Comma separated list of /path1/path2:new_name specs.
dst_base_dir: The base dir to contain the copies.
"""
if not dir_spec_string:
return
dir_specs = dir_spec_string.split(',')
for src_dir_with_name in dir_specs:
src_dir, final_basename = src_dir_with_name.split(':')
dst_dir = os.path.join(dst_base_dir, final_basename)
if os.path.isdir(dst_dir):
logging.info('[DELETE] pre-existing %s', dst_dir)
shutil.rmtree(dst_dir)
logging.info('[COPY] %s -> %s', src_dir, dst_dir)
shutil.copytree(src_dir, dst_dir)
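# A hedged example (paths are hypothetical):
#
#   copy_and_rename_dirs('/src/models:models,/src/tools:bench_tools',
#                        '/workspace/site-packages')
#
# copies /src/models to /workspace/site-packages/models and /src/tools to
# /workspace/site-packages/bench_tools, deleting any pre-existing copies
# first.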
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Checkout repository, download data and build docker image."""
from __future__ import print_function
import argparse
import json
import logging
import os
import shutil
import sys
import tempfile
import time
import perfzero.device_utils as device_utils
import perfzero.perfzero_config as perfzero_config
import perfzero.utils as utils
def _temporary_file_name(parent_dir, base_name):
"""Returns a temp name of the form <parent-dir>/<random>/<base-name>."""
if not os.path.isdir(parent_dir):
os.makedirs(parent_dir)
temp_dir = tempfile.mkdtemp(dir=parent_dir)
return os.path.join(temp_dir, base_name)
def _load_docker_image(FLAGS, workspace_dir, setup_execution_time):
"""Runs docker load --input_image <FLAGS.dockerfile_path>.
Fetches FLAGS.dockerfile_path to workspace_dir/<temp-dir>/local_docker first.
Runs docker load --input <path-to-local-docker>.
Deletes workspace_dir/<temp-dir> after the docker image is loaded.
Args:
FLAGS: parser.parse_known_args object.
workspace_dir: String - The path to use for intermediate artifacts.
setup_execution_time: Map from string->double containing wall times for
different operations. This will have insertions describing the docker
setup time.
"""
load_docker_start_time = time.time()
local_docker_image_path = _temporary_file_name(workspace_dir, 'local_docker')
utils.download_data([{'url': FLAGS.dockerfile_path,
'local_path': local_docker_image_path,
'decompress': False}])
setup_execution_time['fetch_docker'] = time.time() - load_docker_start_time
docker_load_cmd = 'docker load --input {}'.format(local_docker_image_path)
try:
utils.run_commands(
[docker_load_cmd,
'docker images' # Print loaded image list.
])
setup_execution_time['load_docker'] = time.time() - load_docker_start_time
finally:
logging.info('removing parent dir of local docker image copy %s',
local_docker_image_path)
shutil.rmtree(os.path.dirname(local_docker_image_path))
def _create_docker_image(FLAGS, project_dir, workspace_dir,
setup_execution_time):
"""Creates a docker image.
Args:
FLAGS: parser.parse_known_args object.
project_dir: String - The current project path.
workspace_dir: String - The path to use for intermediate artifacts.
setup_execution_time: Map from string->double containing wall times for
different operations. This will have insertions describing the docker
setup time.
"""
# Create docker image
docker_start_time = time.time()
docker_context = os.path.join(workspace_dir, 'resources')
# Necessary in case we don't have a local .whl file.
utils.create_empty_file(docker_context, 'EMPTY')
# Download TensorFlow pip package from Google Cloud Storage and modify package
# path accordingly, if applicable
local_tensorflow_pip_spec = None
if (FLAGS.tensorflow_pip_spec and
(FLAGS.tensorflow_pip_spec.startswith('gs://') or
FLAGS.tensorflow_pip_spec.startswith('file://'))):
local_pip_filename = os.path.basename(FLAGS.tensorflow_pip_spec)
local_pip_path = os.path.join(docker_context, local_pip_filename)
utils.download_data([{'url': FLAGS.tensorflow_pip_spec,
'local_path': local_pip_path}])
# Update path to pip wheel file for the Dockerfile. Note that this path has
# to be relative to the docker context (absolute path will not work).
FLAGS.tensorflow_pip_spec = local_pip_filename
local_tensorflow_pip_spec = local_pip_filename
else:
local_tensorflow_pip_spec = 'EMPTY'
dockerfile_path = FLAGS.dockerfile_path
if not os.path.exists(dockerfile_path):
# Fall back to the deprecated approach if the user-specified
# dockerfile_path does not exist
dockerfile_path = os.path.join(project_dir, FLAGS.dockerfile_path)
extra_pip_specs = (FLAGS.extra_pip_specs or '').replace(';', '')
docker_base_cmd = 'docker build --no-cache --pull'
# FLAGS.extra_docker_build_args will be a list of strings (e.g. ['a', 'b=c']).
# We treat the strings directly as build-args: --build-arg a --build-arg b=c
# Empty strings are ignored.
extra_docker_build_args = ' '.join([
'--build-arg %s' % arg for arg in FLAGS.extra_docker_build_args if arg])
cmd = '{docker_base_cmd} -t {docker_tag}{tf_pip}{local_tf_pip}{extra_pip}{extra_docker_build_args} {suffix}'.format(
docker_base_cmd=docker_base_cmd,
docker_tag=FLAGS.docker_tag,
tf_pip=(
' --build-arg tensorflow_pip_spec={}'.format(
FLAGS.tensorflow_pip_spec) if FLAGS.tensorflow_pip_spec else ''),
# local_tensorflow_pip_spec is either string 'EMPTY' or basename of
# local .whl file.
local_tf_pip=' --build-arg local_tensorflow_pip_spec={}'.format(
local_tensorflow_pip_spec),
extra_pip=' --build-arg extra_pip_specs=\'{}\''.format(extra_pip_specs),
extra_docker_build_args=' ' + extra_docker_build_args,
suffix=(
'-f {} {}'.format(dockerfile_path, docker_context)
if docker_context else '- < {}'.format(dockerfile_path))
)
utils.run_commands([cmd])
logging.info('Built docker image with tag %s', FLAGS.docker_tag)
setup_execution_time['build_docker'] = time.time() - docker_start_time
if __name__ == '__main__':
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
perfzero_config.add_setup_parser_arguments(parser)
FLAGS, unparsed = parser.parse_known_args()
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
level=logging.DEBUG)
if unparsed:
logging.error('Arguments %s are not recognized', unparsed)
sys.exit(1)
setup_execution_time = {}
project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
workspace_dir = os.path.join(project_dir, FLAGS.workspace)
site_package_dir = os.path.join(workspace_dir, 'site-packages')
utils.copy_and_rename_dirs(FLAGS.site_package_downloads,
site_package_dir)
activate_gcloud = False
if FLAGS.dockerfile_path and FLAGS.dockerfile_path.startswith('gs://'):
# We might end up doing gsutil fetch later, so need to call
# active_gcloud_service().
activate_gcloud = True
if FLAGS.tensorflow_pip_spec and FLAGS.tensorflow_pip_spec.startswith('gs://'):
activate_gcloud = True
# Download gcloud auth token. Remove this operation in the future when
# docker in Kokoro can access the GCP metadata server
start_time = time.time()
utils.active_gcloud_service(FLAGS.gcloud_key_file_url,
workspace_dir, download_only=not activate_gcloud)
setup_execution_time['download_token'] = time.time() - start_time
# Set up the raid array.
start_time = time.time()
device_utils.create_drive_from_devices(FLAGS.root_data_dir,
FLAGS.gce_nvme_raid)
setup_execution_time['create_drive'] = time.time() - start_time
if FLAGS.dockerfile_path:
if FLAGS.dockerfile_path.endswith('.tar.gz'):
logging.info('Assuming given file %s is a docker image to load',
FLAGS.dockerfile_path)
_load_docker_image(FLAGS, workspace_dir,
setup_execution_time)
else:
_create_docker_image(FLAGS, project_dir, workspace_dir,
setup_execution_time)
logging.info('Setup time in seconds by operation:\n %s',
json.dumps(setup_execution_time, indent=2))
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks the all-reduce algorithms of tf_cnn_benchmarks.
tf_cnn_benchmarks uses all-reduce to aggregate gradients. This benchmark is
useful for benchmarking the performance of just this gradient aggregation,
instead of the entire model. All the flags that tf_cnn_benchmarks accepts are
also accepted by this script, although many are silently ignored.
The number and shapes of the tensors all-reduced are those of the variables of
the model specified by the --model flag.
TODO(reedwm): Allow custom sizes to be specified.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import app
from absl import flags as absl_flags
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import control_flow_ops
import benchmark_cnn
import cnn_util
import flags
from cnn_util import log_fn
absl_flags.DEFINE_integer('iters_per_step', 5,
'Number of iterations to run all-reduce for, per '
'step. Every step, a session will be run on a Graph '
'that contains this many copies of the all-reduce. '
'The copies are run sequentially. Setting this above '
'1 is useful to lower the overhead of starting the '
'session run, running the VariableV2 ops at the '
'start of the step, etc.')
flags.define_flags()
for name in flags.param_specs.keys():
absl_flags.declare_key_flag(name)
def get_var_shapes(model):
"""Returns the list of variable shapes for a tf_cnn_benchmarks Model."""
with tf.Graph().as_default():
# The variable shapes do not depend on the batch size.
images = tf.placeholder(tf.float32, model.get_input_shapes('train')[0])
model.build_network([images])
return [[int(d) for d in v.shape.dims] for v in tf.trainable_variables()]
def all_reduce(all_device_tensors, variable_mgr):
"""Performs a single batch all-reduce.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
variable_mgr: The VariableMgr to perform the all-reduce.
Returns:
List of list of tensors in the same form as `all_device_tensors`, except the
tensors are aggregated across towers.
"""
tower_grads = [[(g, None) for g in device_tensors] for
device_tensors in all_device_tensors]
_, aggregated_tower_grads = variable_mgr.preprocess_device_grads(tower_grads)
return [
[g for g, _ in agg_device_tensors]
for agg_device_tensors in aggregated_tower_grads]
def build_all_reduce_iterations(all_device_tensors, tower_devices, variable_mgr,
num_iters):
"""Builds the all-reduce ops for multiple iterations to aggregate tensors.
The tensors in `all_device_tensors` are aggregated `num_iters` times. Each
iteration aggregates the results from the previous iteration. The iterations
are run sequentially, so the aggregations for an iteration do not start
running until the previous iteration has completed. Each iteration after the
first is aggregating already-aggregated values, but it does not matter because
we are only aggregating for benchmarking purposes.
Args:
all_device_tensors: List of lists of tensors. all_device_tensors[t][i] is
a tensor, where t is the tower the tensor is on and i is the index of
the tensor.
tower_devices: A list of device strings. tower_devices[t] is the device
of the tensors in all_device_tensors[t].
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that when run, causes the all-reduce ops to run.
"""
for i in range(num_iters):
with tf.name_scope('iteration_%d' % i):
# Step 1: Do the aggregation.
with tf.name_scope('tensor_aggregation'):
all_device_tensors = all_reduce(all_device_tensors, variable_mgr)
# Step 2. Create identity ops, to bring the aggregated results back to
# each device.
new_all_device_tensors = []
for device, device_tensors in zip(tower_devices, all_device_tensors):
with tf.device(device):
new_all_device_tensors.append([
tf.identity(t, name='identity_after_allreduce')
for t in device_tensors
])
all_device_tensors = new_all_device_tensors
# Step 3. Add control dependencies to delay the next iteration until this
# iteration is complete. To avoid extra overhead, we do not have any
# cross-device control dependencies, which means it's possible for two
# iterations to slightly overlap.
new_all_device_tensors = []
for device_tensors in all_device_tensors:
new_all_device_tensors.append([
control_flow_ops.with_dependencies(
device_tensors, t, name='identity_after_dependencies')
for t in device_tensors
])
all_device_tensors = new_all_device_tensors
# To prevent the dependency optimizer from removing every op we created,
# we store the results in variables.
ops_to_run = []
for device, device_tensors in zip(tower_devices, all_device_tensors):
with tf.device(device):
for t in device_tensors:
# The placeholder initial value is never run.
var = tf.Variable(tf.placeholder(tf.float32, t.shape), collections=[])
ops_to_run.append(var.assign(t))
return tf.group(*ops_to_run)
def build_graph(tower_devices, tensor_shapes, variable_mgr, num_iters):
"""Builds the graph for the benchmark.
Args:
tower_devices: A list of device strings of the devices to run the all-reduce
benchmark on.
tensor_shapes: A list of shapes of the tensors that will be aggregated for
the all-reduce.
variable_mgr: The VariableMgr to perform the all-reduce.
num_iters: Number of iterations to aggregate tensors for.
Returns:
An op that runs the benchmark.
"""
all_device_tensors = []
for i, tower_device in enumerate(tower_devices):
with tf.device(tower_device):
device_tensors = []
for j, shape in enumerate(tensor_shapes):
tensor = tf.Variable(tf.random_normal(shape, dtype=tf.float32),
name='tensor_%d_on_device_%d' % (j, i))
device_tensors.append(tensor)
all_device_tensors.append(device_tensors)
log_fn('Building all-reduce ops')
benchmark_op = build_all_reduce_iterations(all_device_tensors, tower_devices,
variable_mgr, num_iters)
log_fn('Done building all-reduce ops')
return benchmark_op
def run_graph(benchmark_op, bench_cnn, init_ops, dummy_loss_op):
"""Runs the graph for the benchmark.
Args:
benchmark_op: An op that runs the benchmark.
bench_cnn: The BenchmarkCNN where params and other attributes are obtained.
init_ops: A list of ops that are run before `benchmark_op` for
initialization.
dummy_loss_op: Any op. We must pass a loss op to
`benchmark_cnn.benchmark_one_step`, but the result of the op is never
actually used.
"""
config = benchmark_cnn.create_config_proto(bench_cnn.params)
with tf.Session(config=config) as sess:
for op in init_ops:
sess.run(op)
step_train_times = []
fetches = {'average_loss': dummy_loss_op, 'benchmark_op': benchmark_op}
log_fn('Running warmup')
for i in range(-bench_cnn.num_warmup_batches, bench_cnn.num_batches):
if i == 0:
log_fn('Running all-reduce ops')
start = time.perf_counter()
if i > 0 and i % bench_cnn.params.display_every == 0:
log_fn('Iteration: %d. Average time per step so far: %s' %
(i, (time.perf_counter() - start) / i))
# Call benchmark_one_step instead of directly calling sess.run(...), to
# potentially get a trace file, partitioned graphs, etc.
benchmark_cnn.benchmark_one_step(
sess=sess,
fetches=fetches,
step=i,
# The batch size is only used for the images/sec calculation, which is
# not actually calculated because we pass show_images_per_sec=False.
batch_size=None,
step_train_times=step_train_times,
trace_filename=bench_cnn.trace_filename,
partitioned_graph_file_prefix=(
bench_cnn.params.partitioned_graph_file_prefix),
profiler=None,
image_producer=None,
params=bench_cnn.params,
show_images_per_sec=False)
log_fn('Average time per step: %s' %
((time.perf_counter() - start) / bench_cnn.num_batches))
def run_benchmark(bench_cnn, num_iters):
"""Runs the all-reduce benchmark.
Args:
bench_cnn: The BenchmarkCNN where params, the variable manager, and other
attributes are obtained.
num_iters: Number of iterations to run all-reduce for.
Raises:
ValueError: Invalid params of bench_cnn.
"""
if bench_cnn.params.variable_update != 'replicated':
raise ValueError('--variable_update=replicated must be specified to use '
'the all-reduce benchmark')
if bench_cnn.params.variable_consistency == 'relaxed':
raise ValueError('--variable_consistency=relaxed is not supported')
benchmark_op = build_graph(bench_cnn.raw_devices,
get_var_shapes(bench_cnn.model),
bench_cnn.variable_mgr, num_iters)
init_ops = [
tf.global_variables_initializer(),
bench_cnn.variable_mgr.get_post_init_ops()
]
loss_op = tf.no_op()
if bench_cnn.graph_file:
path, filename = os.path.split(bench_cnn.graph_file)
as_text = filename.endswith('txt')
log_fn('Writing GraphDef as %s to %s' % (
'text' if as_text else 'binary', bench_cnn.graph_file))
tf.train.write_graph(tf.get_default_graph().as_graph_def(add_shapes=True),
path, filename, as_text)
run_graph(benchmark_op, bench_cnn, init_ops, loss_op)
# TODO(reedwm): Reduce redundancy with tf_cnn_benchmarks
def main(positional_arguments):
# Command-line arguments like '--distortions False' are equivalent to
# '--distortions=True False', where False is a positional argument. To prevent
# this from silently running with distortions, we do not allow positional
# arguments.
assert len(positional_arguments) >= 1
if len(positional_arguments) > 1:
raise ValueError('Received unknown positional arguments: %s'
% positional_arguments[1:])
params = benchmark_cnn.make_params_from_flags()
params = benchmark_cnn.setup(params)
bench = benchmark_cnn.BenchmarkCNN(params)
tfversion = cnn_util.tensorflow_version_tuple()
log_fn('TensorFlow: %i.%i' % (tfversion[0], tfversion[1]))
run_benchmark(bench, absl_flags.FLAGS.iters_per_step)
if __name__ == '__main__':
tf.disable_v2_behavior()
app.run(main) # Raises error on invalid flags, unlike tf.app.run()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for allreduce."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections as pycoll
import re
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
try:
from tensorflow.python.distribute.v1 import all_reduce
except ImportError:
# Compatibility with TF 2.4 and below
from tensorflow.python.distribute import all_reduce
from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import ops
from tensorflow.python.ops import collective_ops
AllReduceSpecTuple = pycoll.namedtuple('AllReduceSpecTuple', 'alg shards limit')
def parse_general_int(s):
"""Parse integer with power-of-2 suffix eg. 32k."""
mo = re.match(r'(\d+)([KkMGT]?)$', s)
if mo:
i, suffix = mo.group(1, 2)
v = int(i)
if suffix:
if suffix == 'K' or suffix == 'k':
v *= 1024
elif suffix == 'M':
v *= (1024 * 1024)
elif suffix == 'G':
v *= (1024 * 1024 * 1024)
elif suffix == 'T':
v *= (1024 * 1024 * 1024 * 1024)
else:
raise ValueError('invalid integer string %s' % s)
return v
else:
v = int(s)
return v
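# Doctest-style examples of the suffix parsing:
#
#   >>> parse_general_int('32k')
#   32768
#   >>> parse_general_int('4M')
#   4194304
#   >>> parse_general_int('100')
#   100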
def parse_all_reduce_spec(all_reduce_spec):
"""Parse all_reduce_spec.
Args:
all_reduce_spec: a string specifying a combination of all-reduce
algorithms to apply for gradient reduction.
Returns:
a list of AllReduceSpecTuple.
Raises:
ValueError: all_reduce_spec is not well-formed.
An all_reduce_spec has BNF form:
int ::= positive whole number
g_int ::= int[KkMGT]?
alg_spec ::= alg | alg#int
range_spec ::= alg_spec | alg_spec/alg_spec
spec ::= range_spec | range_spec:g_int:range_spec
Not all syntactically correct specifications are supported.
Examples of supported all_reduce_spec strings, with semantics explained:
'collective' == apply tf.collective_reduce operator to all tensors.
'collective#2' == apply tf.collective_reduce operator to all tensors,
requesting up to 2 simultaneous transfers at each node, if
feasible, by subdividing tensor by an additional factor of 2.
'xring' == apply ring all-reduce to all tensors
'xring#2' == apply ring all-reduce to all tensors, using two simultaneous
transfer rings, each operating on 1/2 of each tensor.
'nccl' == apply NCCL all-reduce to all tensors (only works within
a single worker process where all devices are GPUs)
'nccl/xring' == apply NCCL all-reduce to all tensors within each worker
to produce at least one full-reduced (locally) value,
then apply ring all-reduce to one such value from each
worker, then apply NCCL broadcast to propagate those globally
reduced values back to every device within each worker.
'pscpu' == Shuffle reduce using worker CPUs as the gather devices: each
distributed tensor is reduced by copying all instances to
one of the worker CPUs, computing the reduction there, then
copying back to each participating device. Tensor reductions
are assigned to specific CPUs round-robin.
'psgpu#4' == Arrange all GPUs across all workers into groups of 4.
Each distributed tensor is shuffle reduced against one
such group of 4 GPUs, selected round-robin. That is, each
tensor is split across 4 shards for the reduction.
'pscpu:2k:pscpu#2:64k:xring' == Apply single-shard pscpu to
tensors of size <= 2048 elements, apply 2-shard pscpu to
tensors up to size 64k elements, apply xring to larger tensors.
'pscpu/pscpu#2' == Use shuffle gather to locally reduce each tensor on
the worker's CPU, then use 2-shard shuffle to reduce those
locally reduced tensors across workers (on the worker CPUs), then
scatter the globally reduced values locally from each worker CPU.
"""
range_parts = all_reduce_spec.split(':') + ['-1']
if len(range_parts) % 2:
raise ValueError('all_reduce_spec not well formed: %s' % all_reduce_spec)
limit = 0
spec = []
alg = None
shards = 1
for i, range_part in enumerate(range_parts):
if i % 2 == 1:
try:
limit = parse_general_int(range_part)
spec.append(AllReduceSpecTuple(alg=alg, shards=shards, limit=limit))
except ValueError:
raise ValueError('all_reduce_spec (%s) contains non-integer range %s' %
(all_reduce_spec, range_part))
else:
alg = range_part
alg_parts = range_part.split('#')
alg = alg_parts[0]
if len(alg_parts) > 1:
try:
shards = int(alg_parts[1])
except ValueError:
raise ValueError('all_reduce_spec (%s) contains non-integer '
'shards %s' % (all_reduce_spec, alg_parts[1]))
else:
shards = 1
if alg not in [
'nccl', 'nccl/xring', 'nccl/rechd', 'nccl/pscpu', 'xring', 'pscpu',
'psgpu', 'pscpu/pscpu', 'collective'
]:
raise ValueError('all_reduce_spec (%s) contains invalid alg %s' %
(all_reduce_spec, alg))
return spec
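# A worked example: parse_all_reduce_spec('pscpu:2k:xring') returns
#
#   [AllReduceSpecTuple(alg='pscpu', shards=1, limit=2048),
#    AllReduceSpecTuple(alg='xring', shards=1, limit=-1)]
#
# i.e. single-shard pscpu for tensors of at most 2048 elements and ring
# all-reduce for everything larger (the trailing limit of -1 means no upper
# bound).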
def build_all_reduce_device_prefixes(job_name, num_tasks):
"""Build list of device prefix names for all_reduce.
Args:
job_name: 'worker', 'ps' or 'localhost'.
num_tasks: number of jobs across which device names should be generated.
Returns:
A list of device name prefix strings. Each element spells out the full
host name without adding the device.
e.g. '/job:worker/task:0'
"""
if job_name != 'localhost':
return ['/job:%s/task:%d' % (job_name, d) for d in range(0, num_tasks)]
else:
assert num_tasks == 1
return ['/job:%s' % job_name]
def group_device_names(devices, group_size):
"""Group device names into groups of group_size.
Args:
devices: list of strings naming devices.
group_size: int >= 1
Returns:
list of lists of devices, where each inner list is group_size long,
and each device appears at least once in an inner list. If
len(devices) % group_size = 0 then each device will appear
exactly once.
Raises:
ValueError: group_size > len(devices)
"""
num_devices = len(devices)
if group_size > num_devices:
raise ValueError('only %d devices, but group_size=%d' % (num_devices,
group_size))
num_groups = (
num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
groups = [[] for i in range(num_groups)]
for i in range(0, num_groups * group_size):
groups[i % num_groups].append(devices[i % num_devices])
return groups
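# For example, grouping four GPU devices into groups of two interleaves the
# device list:
#
#   group_device_names(['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3'], 2)
#   # -> [['/gpu:0', '/gpu:2'], ['/gpu:1', '/gpu:3']]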
def split_grads_by_size(threshold_size, device_grads):
"""Break gradients into two sets according to tensor size.
Args:
threshold_size: int size cutoff for small vs large tensor.
device_grads: List of lists of (gradient, variable) tuples. The outer
list is over devices. The inner list is over individual gradients.
Returns:
small_grads: Subset of device_grads where shape is <= threshold_size
elements.
large_grads: Subset of device_grads where shape is > threshold_size
elements.
"""
small_grads = []
large_grads = []
for dl in device_grads:
small_dl = []
large_dl = []
for (g, v) in dl:
tensor_size = g.get_shape().num_elements()
if tensor_size <= threshold_size:
small_dl.append([g, v])
else:
large_dl.append([g, v])
if small_dl:
small_grads.append(small_dl)
if large_dl:
large_grads.append(large_dl)
return small_grads, large_grads
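# A hedged sketch: with threshold_size=4 and one device whose gradients have
# shapes (2,) and (3, 3), the 2-element gradient lands in small_grads and the
# 9-element gradient in large_grads, each kept as a per-device list of
# [gradient, variable] pairs.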
_instance_key = 1
def new_collective_instance_key():
"""Returns a new instance key for use in defining a collective op."""
global _instance_key
v = _instance_key
_instance_key += 1
return v
_group_key = 1
_group_key_table = dict()
def collective_group_key(devices):
"""Returns a group key for the set of devices.
Args:
devices: list of strings naming devices in a collective group.
Returns:
int key uniquely identifying the set of device names.
"""
global _group_key
global _group_key_table
parsed = [pydev.DeviceSpec.from_string(d) for d in devices]
names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed])
concat = ','.join(names)
if concat not in _group_key_table.keys():
new_key = _group_key
_group_key += 1
_group_key_table[concat] = new_key
rv = _group_key_table[concat]
return rv
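# Illustrative note (added comment): the key depends only on the set of
# device type/index pairs, not their order, because the parsed names are
# sorted before lookup. E.g. ['/gpu:0', '/gpu:1'] and ['/gpu:1', '/gpu:0']
# map to the same group key.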
def build_collective_reduce(input_tensors, num_workers, num_shards,
red_op='Add', un_op='Id'):
"""Build a subgraph that does one full all-reduce, using the collective Op.
Args:
input_tensors: tensors within a single worker graph that are to be reduced
together; must be one per device.
num_workers: total number of workers with identical independent graphs that
will be doing this same reduction. The reduction will actually include
the corresponding tensors at all these workers.
num_shards: number of shards into which to divide each per-tick chunk,
normally 1 but could be higher on multi-data-path architectures.
red_op: string naming the reduction op
un_op: string naming the unary final op
Returns:
An array of final tensors, one per device, computed by the full reduction.
Raises:
ValueError: There must be at least two tensors over all the workers.
"""
group_size = len(input_tensors) * num_workers
if group_size < 2:
raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
devices = [t.device for t in input_tensors]
num_devices = len(devices)
group_key = collective_group_key(devices)
instance_key = new_collective_instance_key()
out_tensors = []
if num_shards == 1:
subdiv_offsets = [0]
elif num_shards == 2:
if num_devices > 1:
subdiv_offsets = [0, -(num_devices // 2)]
else:
subdiv_offsets = [0]
else:
raise ValueError('Unsupported num_shards %d' % num_shards)
for d in range(num_devices):
with ops.device(devices[d]):
reduce_op = collective_ops.all_reduce(input_tensors[d],
group_size, group_key, instance_key,
red_op, un_op,
subdiv_offsets)
out_tensors.append(reduce_op)
return out_tensors
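# Hedged usage sketch (added comment, assuming one tensor per local GPU
# and a single worker; tensor construction is illustrative):
#   t0 = ...  # tensor placed on '/gpu:0'
#   t1 = ...  # tensor placed on '/gpu:1'
#   summed = build_collective_reduce([t0, t1], num_workers=1, num_shards=1)
# summed holds one reduced tensor per device. With num_workers > 1 the
# same graph must be built identically on every worker so that the
# module-level group/instance key counters line up across workers.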
def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
return collective_ops.broadcast_send(t, shape, dtype, group_size, group_key,
instance_key)
def broadcast_recv(shape, dtype, group_size, group_key, instance_key):
return collective_ops.broadcast_recv(shape, dtype, group_size, group_key,
instance_key)
def sum_grad_and_var_all_reduce(single_session,
grad_and_vars,
num_workers,
alg,
gpu_indices,
aux_devices=None,
num_shards=1):
"""Apply all-reduce algorithm over specified gradient tensors."""
scaled_grads = [g for g, _ in grad_and_vars]
if alg == 'collective':
assert not single_session
summed_grads = build_collective_reduce(
scaled_grads, num_workers, num_shards, 'Add', 'Id')
else:
with tf.name_scope('allreduce'):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
if alg == 'nccl':
summed_grads = all_reduce.build_nccl_all_reduce(scaled_grads, tf.add)
elif alg == 'xring':
summed_grads = all_reduce.build_ring_all_reduce(
scaled_grads, num_workers, num_shards, gpu_indices, tf.add)
elif alg == 'nccl/xring':
summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
tf.add)
elif alg == 'nccl/rechd':
summed_grads = all_reduce.build_nccl_then_recursive_hd(
scaled_grads, tf.add)
elif alg == 'nccl/pscpu':
summed_grads = all_reduce.build_nccl_then_shuffle(
scaled_grads, aux_devices, tf.add, tf.add_n)
elif alg == 'pscpu/pscpu':
summed_grads = all_reduce.build_shuffle_then_shuffle(
scaled_grads,
aux_devices,
# TODO(tucker): devise a way of better specifying the device set
# for the second level.
[aux_devices[0]],
tf.add_n)
elif alg in ['pscpu', 'psgpu']:
summed_grads = all_reduce.build_shuffle_all_reduce(
scaled_grads, aux_devices, tf.add_n)
else:
raise ValueError('unsupported all_reduce alg: %s' % alg)
result = []
for (_, v), g in zip(grad_and_vars, summed_grads):
result.append([g, v])
return result
def contains_any(haystack, needles):
"""Tests if any needle is a substring of haystack.
Args:
haystack: a string
needles: list of strings
Returns:
True if any element of needles is a substring of haystack,
False otherwise.
"""
for n in needles:
if n in haystack:
return True
return False
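# Illustrative example (added comment):
#   contains_any('nccl/pscpu', ['pscpu', 'psgpu'])  -> True
#   contains_any('xring', ['pscpu', 'psgpu'])       -> False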
def sum_gradients_all_reduce(single_session,
dev_prefixes,
tower_grads,
num_workers,
alg,
num_shards,
gpu_indices,
agg_small_grads_max_bytes=0,
agg_small_grads_max_group=10,
allreduce_merge_scope=1):
"""Apply all-reduce algorithm over specified gradient tensors.
Args:
single_session: true if reduction is applied to one graph across
all workers, false if this application is to a single-worker graph only.
dev_prefixes: list of prefix strings to use to generate PS device names.
tower_grads: the gradients to reduce.
num_workers: number of worker processes across entire job.
alg: the all-reduce algorithm to apply.
num_shards: alg-specific sharding factor.
gpu_indices: indices of local GPUs in order usable for ring-reduce.
agg_small_grads_max_bytes: largest tensor eligible for aggregation,
in number of bytes.
agg_small_grads_max_group: largest permitted aggregation of small
tensors.
allreduce_merge_scope: size of groups into which to partition consecutive
gradients grouped under a common 'allreduce' name scope for application
of ScopedAllocator optimization.
Returns:
list of reduced tensors
"""
alg_contains_shuffle = contains_any(alg, ['pscpu', 'psgpu'])
is_hierarchical = '/' in alg
if 'pscpu' in alg:
aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
elif 'psgpu' in alg:
aux_devices = [
prefix + '/gpu:%d' % i
for i in range(len(gpu_indices))
for prefix in dev_prefixes
]
else:
aux_devices = ['/job:localhost/cpu:0']
aux_device_groups = group_device_names(
aux_devices,
num_shards if (alg != 'collective' and alg_contains_shuffle) else 1)
group_index = 0
if agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
tower_grads, packing = pack_small_tensors(
tower_grads,
max_bytes=agg_small_grads_max_bytes,
max_group=agg_small_grads_max_group)
else:
packing = None
reduced_gv_list = []
gv = list(zip(*tower_grads))
merge_scope = allreduce_merge_scope if allreduce_merge_scope > 0 else 1
chunked_gv = [gv[x:x + merge_scope]
for x in xrange(0, len(gv), merge_scope)]
for chunk in chunked_gv:
with tf.name_scope('allreduce'):
for grad_and_vars in chunk:
reduced_gv_list.append(sum_grad_and_var_all_reduce(
single_session,
grad_and_vars, num_workers, alg, gpu_indices,
(aux_devices if is_hierarchical
else aux_device_groups[group_index]),
num_shards))
group_index = (group_index + 1) % len(aux_device_groups)
new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
if packing:
new_tower_grads = unpack_small_tensors(new_tower_grads, packing)
return new_tower_grads
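# Hedged usage sketch (added comment; the gradient structure is
# illustrative):
#   dev_prefixes = build_all_reduce_device_prefixes('worker', 2)
#   tower_grads = [...]  # one [(grad, var), ...] list per local device
#   reduced = sum_gradients_all_reduce(
#       False, dev_prefixes, tower_grads, num_workers=2, alg='pscpu',
#       num_shards=1, gpu_indices=[0, 1])
# With agg_small_grads_max_bytes > 0, small tensors are packed before the
# reduction and unpacked afterwards, so 'reduced' mirrors tower_grads.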
def extract_ranges(index_list, range_size_limit=32):
"""Extract consecutive ranges and singles from index_list.
Args:
index_list: List of monotone increasing non-negative integers.
range_size_limit: Largest size range to return. If a larger
consecutive range exists it will be returned as multiple
ranges.
Returns:
ranges, singles where ranges is a list of [first, last] pairs of
consecutive elements in index_list, and singles is all of the
other elements, in original order.
"""
if not index_list:
return [], []
first = index_list[0]
last = first
ranges = []
singles = []
for i in index_list[1:]:
if i == last + 1 and (last - first) <= range_size_limit:
last = i
else:
if last > first:
ranges.append([first, last])
else:
singles.append(first)
first = i
last = i
if last > first:
ranges.append([first, last])
else:
singles.append(first)
return ranges, singles
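# Illustrative example (added comment):
#   extract_ranges([1, 2, 3, 5, 7, 8])
#   -> ([[1, 3], [7, 8]], [5])
# 1..3 and 7..8 are consecutive runs; 5 has no neighbor, so it is a single.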
GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
def pack_range(key, packing, grad_vars, rng):
"""Form the concatenation of a specified range of gradient tensors.
Args:
key: Value under which to store meta-data in packing that will be used
later to restore the grad_var list structure.
packing: Dict holding data describing packed ranges of small tensors.
grad_vars: List of (grad, var) pairs for one tower.
rng: A pair of integers giving the first, last indices of a consecutive
range of tensors to be packed.
Returns:
A tensor that is the concatenation of all the specified small tensors.
"""
to_pack = grad_vars[rng[0]:rng[1] + 1]
members = []
variables = []
restore_shapes = []
with tf.name_scope('pack'):
for g, v in to_pack:
variables.append(v)
restore_shapes.append(g.shape)
with tf.device(g.device):
members.append(tf.reshape(g, [-1]))
packing[key] = GradPackTuple(
indices=range(rng[0], rng[1] + 1),
vars=variables,
shapes=restore_shapes)
with tf.device(members[0].device):
return tf.concat(members, 0)
def unpack_grad_tuple(gv, gpt):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, maybe following subsequent operations like
reduction.
"""
elt_widths = [x.num_elements() for x in gpt.shapes]
with tf.device(gv[0][0].device):
with tf.name_scope('unpack'):
splits = tf.split(gv[0], elt_widths)
unpacked_gv = []
for idx, s in enumerate(splits):
unpacked_gv.append((tf.reshape(s, gpt.shapes[idx]), gpt.vars[idx]))
return unpacked_gv
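# Illustrative note (added comment): for a pack of two gradients with
# shapes [2, 3] and [4], elt_widths is [6, 4]; tf.split cuts the packed
# 1-D tensor back into pieces that are then reshaped to the recorded
# shapes and re-paired with their original variables.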
def pack_small_tensors(tower_grads, max_bytes=0, max_group=0):
"""Concatenate small gradient tensors together for reduction.
Args:
tower_grads: List of lists of (gradient, variable) tuples.
max_bytes: Int giving max number of bytes in a tensor that
may be considered small.
max_group: Int giving max number of small tensors that may be
concatenated into one new tensor.
Returns:
new_tower_grads, packing where new_tower_grads is identical to
tower_grads except that all feasible small_tensors have been removed
from their places and concatenated into larger tensors that are
now in the front of the list for each tower, and packing contains
the data necessary to restore the tower_grads structure.
Look through the first tower for gradients of the same type (float),
and small size, that are all sequential. For each such group,
replace by a new tensor that is a flattened concatenation. Note
that the corresponding variable will be absent, which doesn't matter
because it isn't used during all-reduce.
Requires:
Every gv_list in towers must have isomorphic structure including identical
tensor sizes and types.
"""
small_indices = []
large_indices = []
for idx, (g, _) in enumerate(tower_grads[0]):
if g.dtype == tf.float32 and (4 * g.shape.num_elements()) <= max_bytes:
small_indices.append(idx)
else:
large_indices.append(idx)
small_ranges, small_singles = extract_ranges(
small_indices, range_size_limit=max_group)
large_indices = sorted(large_indices + small_singles)
num_gv = len(tower_grads[0])
packing = {}
if small_ranges:
new_tower_grads = []
for dev_idx, gv_list in enumerate(tower_grads):
assert len(gv_list) == num_gv
new_gv_list = []
for r in small_ranges:
key = '%d:%d' % (dev_idx, len(new_gv_list))
new_gv_list.append((pack_range(key, packing, gv_list, r),
'packing_var_placeholder'))
for i in large_indices:
new_gv_list.append(gv_list[i])
new_tower_grads.append(new_gv_list)
return new_tower_grads, packing
else:
return tower_grads, None
def unpack_small_tensors(tower_grads, packing):
"""Undo the structure alterations to tower_grads done by pack_small_tensors.
Args:
tower_grads: List of List of (grad, var) tuples.
packing: A dict generated by pack_small_tensors describing the changes
it made to tower_grads.
Returns:
new_tower_grads: identical to tower_grads except that concatenations
of small tensors have been split apart and returned to their original
positions, paired with their original variables.
"""
if not packing:
return tower_grads
new_tower_grads = []
num_devices = len(tower_grads)
num_packed = len(packing.keys()) // num_devices
for dev_idx, gv_list in enumerate(tower_grads):
new_gv_list = gv_list[num_packed:]
for i in xrange(0, num_packed):
k = '%d:%d' % (dev_idx, i)
gpt = packing[k]
gv = unpack_grad_tuple(gv_list[i], gpt)
for gi, idx in enumerate(gpt.indices):
assert idx == gpt.indices[gi]
new_gv_list.insert(idx, gv[gi])
new_tower_grads.append(new_gv_list)
return new_tower_grads
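# Hedged round-trip sketch (added comment; tower_grads is illustrative):
#   packed, packing = pack_small_tensors(tower_grads, max_bytes=4096,
#                                        max_group=16)
#   reduced = ...  # run the all-reduce over 'packed'
#   restored = unpack_small_tensors(reduced, packing)
# 'restored' has the same per-tower (grad, var) layout as tower_grads,
# with each small gradient back at its original index.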