# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD300 Model Configuration.
References:
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander C. Berg
SSD: Single Shot MultiBox Detector
arXiv:1512.02325
Ported from MLPerf reference implementation:
https://github.com/mlperf/reference/tree/ssd/single_stage_detector/ssd
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import os
import re
import threading
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import constants
import mlperf
import ssd_constants
from cnn_util import log_fn
from models import model as model_lib
from models import resnet_model
from tensorflow.contrib import layers as contrib_layers
from tensorflow.python.ops import variables
BACKBONE_MODEL_SCOPE_NAME = 'resnet34_backbone'
class SSD300Model(model_lib.CNNModel):
"""Single Shot Multibox Detection (SSD) model for 300x300 image datasets."""
def __init__(self, label_num=ssd_constants.NUM_CLASSES, batch_size=32,
learning_rate=1e-3, backbone='resnet34', params=None):
super(SSD300Model, self).__init__('ssd300', 300, batch_size, learning_rate,
params=params)
# For COCO dataset, 80 categories + 1 background = 81 labels
self.label_num = label_num
# Currently only support ResNet-34 as backbone model
if backbone != 'resnet34':
raise ValueError('Invalid backbone model %s for SSD.' % backbone)
mlperf.logger.log(key=mlperf.tags.BACKBONE, value=backbone)
# Number of channels and default boxes associated with the following layers:
# ResNet34 layer, Conv7, Conv8_2, Conv9_2, Conv10_2, Conv11_2
self.out_chan = [256, 512, 512, 256, 256, 256]
mlperf.logger.log(key=mlperf.tags.LOC_CONF_OUT_CHANNELS,
value=self.out_chan)
# Number of default boxes from layers of different scales
# 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
self.num_dboxes = [4, 6, 6, 6, 4, 4]
mlperf.logger.log(key=mlperf.tags.NUM_DEFAULTS_PER_CELL,
value=self.num_dboxes)
# TODO(haoyuzhang): in order to correctly restore in replicated mode, need
# to create a saver for each tower before graph is finalized. Use variable
# manager for better efficiency.
self.backbone_savers = []
# Collected predictions for eval stage. It maps each image id in eval
# dataset to a dict containing the following information:
# source_id: raw ID of image
# raw_shape: raw shape of image
# pred_box: encoded box coordinates of prediction
# pred_scores: scores of classes in prediction
self.predictions = {}
# Global step when predictions are collected.
self.eval_global_step = 0
# Average precision. In asynchronous eval mode, this is the latest AP we
# have so far, which may not be the result at the current eval step.
self.eval_coco_ap = 0
# Process, queues, and thread for asynchronous evaluation. When enabled,
# we create a separate process (async_eval_process) that continuously pulls
# intermediate results from the predictions queue (a multiprocessing queue),
# processes them, and pushes final results into the results queue (another
# multiprocessing queue). The main thread is responsible for pushing
# messages into the predictions queue, and starts a separate thread to
# continuously pull messages from the results queue to update final results.
# Messages in the predictions queue are tuples of two elements:
# (evaluation step, predictions)
# Messages in the results queue are tuples of two elements:
# (evaluation step, final results)
self.async_eval_process = None
self.async_eval_predictions_queue = None
self.async_eval_results_queue = None
self.async_eval_results_getter_thread = None
# The MLPerf reference uses a starting lr of 1e-3 at bs=32.
self.base_lr_batch_size = 32
def skip_final_affine_layer(self):
return True
def gpu_preprocess_nhwc(self, images, phase_train=True):
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation ; To evaluate using the COCO '
'metric, download and install Python COCO API from '
'https://github.com/cocodataset/cocoapi')
if phase_train:
images = ssd_dataloader.color_jitter(
images, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)
images = ssd_dataloader.normalize_image(images)
return images
def add_backbone_model(self, cnn):
# --------------------------------------------------------------------------
# Resnet-34 backbone model -- modified for SSD
# --------------------------------------------------------------------------
# Input 300x300, output 150x150
cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True)
cnn.mpool(3, 3, 2, 2, mode='SAME')
resnet34_layers = [3, 4, 6, 3]
version = 'v1'
# ResNet-34 block group 1
# Input 150x150, output 75x75
for i in range(resnet34_layers[0]):
# Last argument forces residual_block to use a projection shortcut, even
# though the numbers of input and output channels are equal
resnet_model.residual_block(cnn, 64, 1, version, i == 0)
# ResNet-34 block group 2
# Input 75x75, output 38x38
for i in range(resnet34_layers[1]):
stride = 2 if i == 0 else 1
resnet_model.residual_block(cnn, 128, stride, version, i == 0)
# ResNet-34 block group 3
# This block group is modified: the first layer uses stride=1 so that the
# feature map size does not change within this group of layers
# Input 38x38, output 38x38
for i in range(resnet34_layers[2]):
# The following line is intentionally commented out to differentiate from
# the original ResNet-34 model
# stride = 2 if i == 0 else 1
resnet_model.residual_block(cnn, 256, stride, version, i == 0)
# ResNet-34 block group 4: removed final block group
# The following 3 lines are intentionally commented out to differentiate
# from the original ResNet-34 model
# for i in range(resnet34_layers[3]):
# stride = 2 if i == 0 else 1
# resnet_model.residual_block(cnn, 512, stride, version, i == 0)
def add_inference(self, cnn):
cnn.use_batch_norm = True
cnn.batch_norm_config = {'decay': ssd_constants.BATCH_NORM_DECAY,
'epsilon': ssd_constants.BATCH_NORM_EPSILON,
'scale': True}
with tf.variable_scope(BACKBONE_MODEL_SCOPE_NAME):
self.add_backbone_model(cnn)
# --------------------------------------------------------------------------
# SSD additional layers
# --------------------------------------------------------------------------
def add_ssd_layer(cnn, depth, k_size, stride, mode):
return cnn.conv(
depth,
k_size,
k_size,
stride,
stride,
mode=mode,
use_batch_norm=False,
kernel_initializer=contrib_layers.xavier_initializer())
# Activations for feature maps of different layers
self.activations = [cnn.top_layer]
# Conv7_1, Conv7_2
# Input 38x38, output 19x19
add_ssd_layer(cnn, 256, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))
# Conv8_1, Conv8_2
# Input 19x19, output 10x10
add_ssd_layer(cnn, 256, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))
# Conv9_1, Conv9_2
# Input 10x10, output 5x5
add_ssd_layer(cnn, 128, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 256, 3, 2, 'same'))
# Conv10_1, Conv10_2
# Input 5x5, output 3x3
add_ssd_layer(cnn, 128, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))
# Conv11_1, Conv11_2
# Input 3x3, output 1x1
add_ssd_layer(cnn, 128, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))
self.loc = []
self.conf = []
for nd, ac, oc in zip(self.num_dboxes, self.activations, self.out_chan):
l = cnn.conv(
nd * 4,
3,
3,
1,
1,
input_layer=ac,
num_channels_in=oc,
activation=None,
use_batch_norm=False,
kernel_initializer=contrib_layers.xavier_initializer())
scale = l.get_shape()[-1]
# shape = [batch_size, nd * 4, scale, scale]
l = tf.reshape(l, [self.batch_size, nd, 4, scale, scale])
# shape = [batch_size, nd, 4, scale, scale]
l = tf.transpose(l, [0, 1, 3, 4, 2])
# shape = [batch_size, nd, scale, scale, 4]
self.loc.append(tf.reshape(l, [self.batch_size, -1, 4]))
# shape = [batch_size, nd * scale * scale, 4]
c = cnn.conv(
nd * self.label_num,
3,
3,
1,
1,
input_layer=ac,
num_channels_in=oc,
activation=None,
use_batch_norm=False,
kernel_initializer=contrib_layers.xavier_initializer())
# shape = [batch_size, nd * label_num, scale, scale]
c = tf.reshape(c, [self.batch_size, nd, self.label_num, scale, scale])
# shape = [batch_size, nd, label_num, scale, scale]
c = tf.transpose(c, [0, 1, 3, 4, 2])
# shape = [batch_size, nd, scale, scale, label_num]
self.conf.append(tf.reshape(c, [self.batch_size, -1, self.label_num]))
# shape = [batch_size, nd * scale * scale, label_num]
# Shape of locs: [batch_size, NUM_SSD_BOXES, 4]
# Shape of confs: [batch_size, NUM_SSD_BOXES, label_num]
locs, confs = tf.concat(self.loc, 1), tf.concat(self.conf, 1)
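# Sanity check (illustrative): the feature maps above contribute
# 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4
# = 5776 + 2166 + 600 + 150 + 36 + 4 = 8732 default boxes in total,
# which should match ssd_constants.NUM_SSD_BOXES.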
# Pack location and confidence outputs into a single output layer
# Shape of logits: [batch_size, NUM_SSD_BOXES, 4+label_num]
logits = tf.concat([locs, confs], 2)
cnn.top_layer = logits
cnn.top_size = 4 + self.label_num
return cnn.top_layer
def get_learning_rate(self, global_step, batch_size):
rescaled_lr = self.get_scaled_base_learning_rate(batch_size)
# Defined in MLPerf reference model
boundaries = [160000, 200000]
boundaries = [b * self.base_lr_batch_size // batch_size for b in boundaries]
decays = [1, 0.1, 0.01]
learning_rates = [rescaled_lr * d for d in decays]
lr = tf.train.piecewise_constant(global_step, boundaries, learning_rates)
warmup_steps = int(118287 / batch_size * 5)
warmup_lr = (
rescaled_lr * tf.cast(global_step, tf.float32) / tf.cast(
warmup_steps, tf.float32))
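# Worked example (illustrative): at batch_size=64 the decay boundaries
# [160000, 200000] rescale to [80000, 100000], and
# warmup_steps = int(118287 / 64 * 5) = 9241, i.e. roughly 5 epochs of
# the 118287-image COCO train set.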
return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr)
def get_scaled_base_learning_rate(self, batch_size):
"""Calculates base learning rate for creating lr schedule.
In replicated mode, gradients are summed rather than averaged which, with
the sgd and momentum optimizers, increases the effective learning rate by
lr * num_gpus. Dividing the base lr by num_gpus negates the increase.
Args:
batch_size: Total batch-size.
Returns:
Base learning rate to use to create lr schedule.
"""
base_lr = self.learning_rate
if self.params.variable_update == 'replicated':
base_lr = self.learning_rate / self.params.num_gpus
scaled_lr = base_lr * (batch_size / self.base_lr_batch_size)
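# Worked example (illustrative): with the default base lr of 1e-3 and a
# total batch size of 128, scaled_lr = 1e-3 * (128 / 32) = 4e-3. In
# replicated mode on 4 GPUs, base_lr is first divided by 4, giving
# scaled_lr = 1e-3 again.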
return scaled_lr
def _collect_backbone_vars(self):
backbone_vars = tf.get_collection(
tf.GraphKeys.GLOBAL_VARIABLES, scope='.*'+ BACKBONE_MODEL_SCOPE_NAME)
var_list = {}
# Assume variables in the checkpoint follow the naming convention of a
# model checkpoint trained with the TF official models.
# TODO(haoyuzhang): the following variable name parsing is hacky and easy
# to break if the naming convention of either the benchmarks or the
# official models changes.
for v in backbone_vars:
# conv2d variable example (model <-- checkpoint):
# v/cg/conv24/conv2d/kernel:0 <-- conv2d_24/kernel
if 'conv2d' in v.name:
re_match = re.search(r'conv(\d+)/conv2d/(.+):', v.name)
if re_match:
layer_id = int(re_match.group(1))
param_name = re_match.group(2)
vname_in_ckpt = self._var_name_in_official_model_ckpt(
'conv2d', layer_id, param_name)
var_list[vname_in_ckpt] = v
# batchnorm variable example:
# v/cg/conv24/batchnorm25/gamma:0 <-- batch_normalization_25/gamma
elif 'batchnorm' in v.name:
re_match = re.search(r'batchnorm(\d+)/(.+):', v.name)
if re_match:
layer_id = int(re_match.group(1))
param_name = re_match.group(2)
vname_in_ckpt = self._var_name_in_official_model_ckpt(
'batch_normalization', layer_id, param_name)
var_list[vname_in_ckpt] = v
return var_list
def _var_name_in_official_model_ckpt(self, layer_name, layer_id, param_name):
"""Return variable names according to convention in TF official models."""
vname_in_ckpt = layer_name
if layer_id > 0:
vname_in_ckpt += '_' + str(layer_id)
vname_in_ckpt += '/' + param_name
return vname_in_ckpt
def loss_function(self, inputs, build_network_result):
logits = build_network_result.logits
# Unpack model output back to locations and confidence scores of predictions
# Shape of pred_loc: [batch_size, NUM_SSD_BOXES, 4]
# Shape of pred_label: [batch_size, NUM_SSD_BOXES, label_num]
pred_loc, pred_label = tf.split(logits, [4, self.label_num], 2)
# Shape of gt_loc: [batch_size, NUM_SSD_BOXES, 4]
# Shape of gt_label: [batch_size, NUM_SSD_BOXES, 1]
# Shape of num_gt: [batch_size]
_, gt_loc, gt_label, num_gt = inputs
gt_label = tf.cast(gt_label, tf.int32)
box_loss = self._localization_loss(pred_loc, gt_loc, gt_label, num_gt)
class_loss = self._classification_loss(pred_label, gt_label, num_gt)
tf.summary.scalar('box_loss', tf.reduce_mean(box_loss))
tf.summary.scalar('class_loss', tf.reduce_mean(class_loss))
return class_loss + box_loss
def _localization_loss(self, pred_loc, gt_loc, gt_label, num_matched_boxes):
"""Computes the localization loss.
Computes the localization loss using smooth l1 loss.
Args:
pred_loc: a flattened tensor containing all predicted locations. The
shape is [batch_size, num_anchors, 4].
gt_loc: a tensor representing box regression targets in
[batch_size, num_anchors, 4].
gt_label: a tensor that represents the classification groundtruth targets.
The shape is [batch_size, num_anchors, 1].
num_matched_boxes: the number of anchors matched to groundtruth
targets, used as the loss normalizer. The shape is [batch_size].
Returns:
box_loss: a float32 representing total box regression loss.
"""
mask = tf.greater(tf.squeeze(gt_label), 0)
float_mask = tf.cast(mask, tf.float32)
smooth_l1 = tf.reduce_sum(tf.losses.huber_loss(
gt_loc, pred_loc,
reduction=tf.losses.Reduction.NONE
), axis=2)
smooth_l1 = tf.multiply(smooth_l1, float_mask)
box_loss = tf.reduce_sum(smooth_l1, axis=1)
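# Only anchors matched to a groundtruth box (gt_label > 0) contribute; for
# example (sketch), an image with 10 matched anchors gets the sum of their
# smooth-L1 terms divided by 10.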
return tf.reduce_mean(box_loss / num_matched_boxes)
def _classification_loss(self, pred_label, gt_label, num_matched_boxes):
"""Computes the classification loss.
Computes the classification loss with hard negative mining.
Args:
pred_label: a flattened tensor of predicted class scores. The shape
is [batch_size, num_anchors, num_classes].
gt_label: a tensor that represents the classification groundtruth targets.
The shape is [batch_size, num_anchors, 1].
num_matched_boxes: the number of anchors matched to groundtruth targets.
This is used as the loss normalizer.
Returns:
class_loss: a float32 representing the total classification loss.
"""
cross_entropy = tf.losses.sparse_softmax_cross_entropy(
gt_label, pred_label, reduction=tf.losses.Reduction.NONE)
mask = tf.greater(tf.squeeze(gt_label), 0)
float_mask = tf.cast(mask, tf.float32)
# Hard example mining
neg_masked_cross_entropy = cross_entropy * (1 - float_mask)
relative_position = tf.argsort(
tf.argsort(
neg_masked_cross_entropy, direction='DESCENDING'))
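# The double argsort is a rank trick: argsort(argsort(x, DESCENDING))
# yields each element's descending rank. For example (sketch), a row
# [0.1, 0.9, 0.5] has ranks [2, 0, 1], so tf.less(rank, k) selects the k
# largest negative losses as hard negatives.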
num_neg_boxes = tf.minimum(
tf.to_int32(num_matched_boxes) * ssd_constants.NEGS_PER_POSITIVE,
ssd_constants.NUM_SSD_BOXES)
top_k_neg_mask = tf.cast(tf.less(
relative_position,
tf.tile(num_neg_boxes[:, tf.newaxis], (1, ssd_constants.NUM_SSD_BOXES))
), tf.float32)
class_loss = tf.reduce_sum(
tf.multiply(cross_entropy, float_mask + top_k_neg_mask), axis=1)
return tf.reduce_mean(class_loss / num_matched_boxes)
def add_backbone_saver(self):
# Create saver with mapping from variable names in checkpoint of backbone
# model to variables in SSD model
backbone_var_list = self._collect_backbone_vars()
self.backbone_savers.append(tf.train.Saver(backbone_var_list))
def load_backbone_model(self, sess, backbone_model_path):
for saver in self.backbone_savers:
saver.restore(sess, backbone_model_path)
def get_input_data_types(self, subset):
if subset == 'validation':
return [self.data_type, tf.float32, tf.float32, tf.float32, tf.int32]
return [self.data_type, tf.float32, tf.float32, tf.float32]
def get_input_shapes(self, subset):
"""Return encoded tensor shapes for train and eval data respectively."""
if subset == 'validation':
# Validation data shapes:
# 1. images
# 2. ground truth locations of boxes
# 3. ground truth classes of objects in boxes
# 4. source image IDs
# 5. raw image shapes
return [
[self.batch_size, self.image_size, self.image_size, self.depth],
[self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 4],
[self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 1],
[self.batch_size],
[self.batch_size, 3],
]
# Training data shapes:
# 1. images
# 2. ground truth locations of boxes
# 3. ground truth classes of objects in boxes
# 4. numbers of objects in images
return [
[self.batch_size, self.image_size, self.image_size, self.depth],
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 4],
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 1],
[self.batch_size]
]
def accuracy_function(self, inputs, logits):
"""Returns the ops to measure the mean precision of the model."""
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
from object_detection.box_coders import faster_rcnn_box_coder # pylint: disable=g-import-not-at-top
from object_detection.core import box_coder # pylint: disable=g-import-not-at-top
from object_detection.core import box_list # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation ; To evaluate using the COCO '
'metric, download and install Python COCO API from '
'https://github.com/cocodataset/cocoapi')
# Unpack model output back to locations and confidence scores of predictions
# pred_locs: relative locations (coordinates) of objects in all SSD boxes
# shape: [batch_size, NUM_SSD_BOXES, 4]
# pred_labels: confidence scores of objects being of all categories
# shape: [batch_size, NUM_SSD_BOXES, label_num]
pred_locs, pred_labels = tf.split(logits, [4, self.label_num], 2)
ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
scale_factors=ssd_constants.BOX_CODER_SCALES)
anchors = box_list.BoxList(
tf.convert_to_tensor(ssd_dataloader.DefaultBoxes()('ltrb')))
pred_boxes = box_coder.batch_decode(
encoded_boxes=pred_locs, box_coder=ssd_box_coder, anchors=anchors)
pred_scores = tf.nn.softmax(pred_labels, axis=2)
# TODO(haoyuzhang): maybe use `gt_boxes` and `gt_classes` for visualization.
_, gt_boxes, gt_classes, source_id, raw_shape = inputs # pylint: disable=unused-variable
return {
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.PRED_BOXES): pred_boxes,
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.PRED_SCORES): pred_scores,
# TODO(haoyuzhang): maybe use these values for visualization.
# constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_boxes': gt_boxes,
# constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_classes': gt_classes,
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.SOURCE_ID): source_id,
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.RAW_SHAPE): raw_shape
}
def postprocess(self, results):
"""Postprocess results returned from model."""
try:
import coco_metric # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation ; To evaluate using the COCO '
'metric, download and install Python COCO API from '
'https://github.com/cocodataset/cocoapi')
pred_boxes = results[ssd_constants.PRED_BOXES]
pred_scores = results[ssd_constants.PRED_SCORES]
# TODO(haoyuzhang): maybe use these values for visualization.
# gt_boxes = results['gt_boxes']
# gt_classes = results['gt_classes']
source_id = results[ssd_constants.SOURCE_ID]
raw_shape = results[ssd_constants.RAW_SHAPE]
# COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once.
# Because COCO_NUM_VAL_IMAGES % batch_size != 0 in general, setting
# `num_eval_epochs` to 1 is not enough and will often miss some images. We
# expect the user to set `num_eval_epochs` to >1, which will leave some unused
# images from previous steps in `predictions`. Here we check whether we are
# doing eval at a new global step.
if results['global_step'] > self.eval_global_step:
self.eval_global_step = results['global_step']
self.predictions.clear()
for i, sid in enumerate(source_id):
self.predictions[int(sid)] = {
ssd_constants.PRED_BOXES: pred_boxes[i],
ssd_constants.PRED_SCORES: pred_scores[i],
ssd_constants.SOURCE_ID: source_id[i],
ssd_constants.RAW_SHAPE: raw_shape[i]
}
# The COCO metric calculates mAP only after a full epoch of evaluation.
# Return dummy results for top_N_accuracy to be compatible with
# benchmark_cnn.py.
if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
log_fn('Got results for all {:d} eval examples. Calculate mAP...'.format(
ssd_constants.COCO_NUM_VAL_IMAGES))
annotation_file = os.path.join(self.params.data_dir,
ssd_constants.ANNOTATION_FILE)
# The size of the predictions before decoding is about 15--30 GB, while the
# size after decoding is 100--200 MB. When using async eval mode, decoding
# takes 20--30 seconds of main-thread time but is necessary to avoid OOM
# during inter-process communication.
decoded_preds = coco_metric.decode_predictions(self.predictions.values())
self.predictions.clear()
if self.params.collect_eval_results_async:
def _eval_results_getter():
"""Iteratively get eval results from async eval process."""
while True:
step, eval_results = self.async_eval_results_queue.get()
self.eval_coco_ap = eval_results['COCO/AP']
mlperf.logger.log_eval_accuracy(
self.eval_coco_ap, step, self.batch_size * self.params.num_gpus,
ssd_constants.COCO_NUM_TRAIN_IMAGES)
if self.reached_target():
# Reached target, clear all pending messages in predictions queue
# and insert poison pill to stop the async eval process.
while not self.async_eval_predictions_queue.empty():
self.async_eval_predictions_queue.get()
self.async_eval_predictions_queue.put('STOP')
break
if not self.async_eval_process:
# Limit the number of messages in the predictions queue to prevent OOM.
# Each message (predictions data) can potentially consume a lot of
# memory, and normally there should only be a few messages in the queue.
# If this often blocks, consider reducing the eval frequency.
self.async_eval_predictions_queue = multiprocessing.Queue(2)
self.async_eval_results_queue = multiprocessing.Queue()
# The reason to use a Process rather than a Thread is mainly the
# computationally intensive eval runner: Python threads do not truly run
# in parallel, so a runner thread would either be significantly delayed
# or would delay the main thread.
self.async_eval_process = multiprocessing.Process(
target=coco_metric.async_eval_runner,
args=(self.async_eval_predictions_queue,
self.async_eval_results_queue,
annotation_file))
self.async_eval_process.daemon = True
self.async_eval_process.start()
self.async_eval_results_getter_thread = threading.Thread(
target=_eval_results_getter, args=())
self.async_eval_results_getter_thread.daemon = True
self.async_eval_results_getter_thread.start()
self.async_eval_predictions_queue.put(
(self.eval_global_step, decoded_preds))
return {'top_1_accuracy': 0, 'top_5_accuracy': 0.}
eval_results = coco_metric.compute_map(decoded_preds, annotation_file)
self.eval_coco_ap = eval_results['COCO/AP']
ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
for metric_key, metric_value in eval_results.items():
ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value
mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step,
self.batch_size * self.params.num_gpus,
ssd_constants.COCO_NUM_TRAIN_IMAGES)
return ret
log_fn('Got {:d} out of {:d} eval examples.'
' Waiting for the remaining to calculate mAP...'.format(
len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
def get_synthetic_inputs(self, input_name, nclass):
"""Generating synthetic data matching real data shape and type."""
inputs = tf.random_uniform(
self.get_input_shapes('train')[0], dtype=self.data_type)
inputs = variables.VariableV1(inputs, trainable=False,
collections=[tf.GraphKeys.LOCAL_VARIABLES],
name=input_name)
boxes = tf.random_uniform(
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 4], dtype=tf.float32)
classes = tf.random_uniform(
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 1], dtype=tf.float32)
nboxes = tf.random_uniform(
[self.batch_size], minval=1, maxval=10, dtype=tf.float32)
return (inputs, boxes, classes, nboxes)
def reached_target(self):
return (self.params.stop_at_top_1_accuracy and
self.eval_coco_ap >= self.params.stop_at_top_1_accuracy)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Image pre-processing utilities.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import cnn_util
from tensorflow.python.data.experimental.ops import threadpool
from tensorflow.python.data.ops import multi_device_iterator_ops
from tensorflow.python.framework import function
from tensorflow.python.layers import utils
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
import mlperf
def parse_example_proto(example_serialized):
"""Parses an Example proto containing a training example of an image.
The output of the build_image_data.py image preprocessing script is a dataset
containing serialized Example protocol buffers. Each Example proto contains
the following fields:
image/height: 462
image/width: 581
image/colorspace: 'RGB'
image/channels: 3
image/class/label: 615
image/class/synset: 'n03623198'
image/class/text: 'knee pad'
image/object/bbox/xmin: 0.1
image/object/bbox/xmax: 0.9
image/object/bbox/ymin: 0.2
image/object/bbox/ymax: 0.6
image/object/bbox/label: 615
image/format: 'JPEG'
image/filename: 'ILSVRC2012_val_00041207.JPEG'
image/encoded: <JPEG encoded string>
Args:
example_serialized: scalar Tensor tf.string containing a serialized
Example protocol buffer.
Returns:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
label: Tensor tf.int32 containing the label.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as
[ymin, xmin, ymax, xmax].
text: Tensor tf.string containing the human-readable label.
"""
# Dense features in Example proto.
feature_map = {
'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
default_value=''),
'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
default_value=-1),
'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
default_value=''),
}
sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
# Sparse features in Example proto.
feature_map.update(
{k: sparse_float32 for k in ['image/object/bbox/xmin',
'image/object/bbox/ymin',
'image/object/bbox/xmax',
'image/object/bbox/ymax']})
features = tf.parse_single_example(example_serialized, feature_map)
label = tf.cast(features['image/class/label'], dtype=tf.int32)
xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
# Note that we impose an ordering of (y, x) just to make life difficult.
bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
# Force the variable number of bounding boxes into the shape
# [1, num_boxes, coords].
bbox = tf.expand_dims(bbox, 0)
bbox = tf.transpose(bbox, [0, 2, 1])
return features['image/encoded'], label, bbox, features['image/class/text']
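# Usage sketch for parse_example_proto (assuming `filenames` lists TFRecord
# files produced by build_image_data.py):
#   ds = tf.data.TFRecordDataset(filenames)
#   ds = ds.map(parse_example_proto)  # -> (image_buffer, label, bbox, text)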
_RESIZE_METHOD_MAP = {
'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR,
'bilinear': tf.image.ResizeMethod.BILINEAR,
'bicubic': tf.image.ResizeMethod.BICUBIC,
'area': tf.image.ResizeMethod.AREA
}
def get_image_resize_method(resize_method, batch_position=0):
"""Get tensorflow resize method.
If resize_method is 'round_robin', return different methods based on batch
position in a round-robin fashion. NOTE: If the batch size is not a multiple
of the number of methods, then the distribution of methods will not be
uniform.
Args:
resize_method: (string) nearest, bilinear, bicubic, area, or round_robin.
batch_position: position of the image in a batch. NOTE: this argument can
be an integer or a tensor
Returns:
one of the resize types defined in tf.image.ResizeMethod.
"""
if resize_method != 'round_robin':
return _RESIZE_METHOD_MAP[resize_method]
# return a resize method based on batch position in a round-robin fashion.
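# For example (illustrative), integer batch positions 0, 1, 2, 3, 4, ...
# map to nearest, bilinear, bicubic, area, nearest, ...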
resize_methods = list(_RESIZE_METHOD_MAP.values())
def lookup(index):
return resize_methods[index]
def resize_method_0():
return utils.smart_cond(batch_position % len(resize_methods) == 0,
lambda: lookup(0), resize_method_1)
def resize_method_1():
return utils.smart_cond(batch_position % len(resize_methods) == 1,
lambda: lookup(1), resize_method_2)
def resize_method_2():
return utils.smart_cond(batch_position % len(resize_methods) == 2,
lambda: lookup(2), lambda: lookup(3))
# NOTE(jsimsa): Unfortunately, we cannot use a single recursive function here
# because TF would not be able to construct a finite graph.
return resize_method_0()
def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32):
"""Decode a JPEG string into one 3-D float image Tensor.
Args:
image_buffer: scalar string Tensor.
scope: Optional scope for op_scope.
Returns:
3-D float Tensor with values ranging from [0, 1).
"""
# with tf.op_scope([image_buffer], scope, 'decode_jpeg'):
# with tf.name_scope(scope, 'decode_jpeg', [image_buffer]):
with tf.name_scope(scope or 'decode_jpeg'):
# Decode the string as an RGB JPEG.
# Note that the resulting image contains an unknown height and width
# that is set dynamically by decode_jpeg. In other words, the height
# and width of the image are unknown at compile time.
image = tf.image.decode_jpeg(image_buffer, channels=3,
fancy_upscaling=False,
dct_method='INTEGER_FAST')
# image = tf.Print(image, [tf.shape(image)], 'Image shape: ')
return image
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
def normalized_image(images):
# Rescale from [0, 255] to [0, 2]
images = tf.multiply(images, 1. / 127.5)
# Rescale to [-1, 1]
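# Worked example: a pixel value of 255 maps to 255 / 127.5 - 1 = 1.0,
# 127.5 maps to 0.0, and 0 maps to -1.0.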
mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION, value=[1.0] * 3)
return tf.subtract(images, 1.0)
def eval_image(image,
height,
width,
batch_position,
resize_method,
summary_verbosity=0):
"""Get the image for model evaluation.
We preprocess the image similarly to Slim, see
https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py
Validation images do not have bounding boxes, so to crop the image, we first
resize the image such that the aspect ratio is maintained and the resized
height and width are both at least 1.145 times `height` and `width`
respectively. Then, we do a central crop to size (`height`, `width`).
Args:
image: 3-D float Tensor representing the image.
height: The height of the image that will be returned.
width: The width of the image that will be returned.
batch_position: position of the image in a batch, which affects how images
are distorted and resized. NOTE: this argument can be an integer or a
tensor
resize_method: one of the strings 'round_robin', 'nearest', 'bilinear',
'bicubic', or 'area'.
summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both
summaries and checkpoints.
Returns:
An image of size (output_height, output_width, 3) that is resized and
cropped as described above.
"""
# TODO(reedwm): Currently we resize then crop. Investigate if it's faster to
# crop then resize.
with tf.name_scope('eval_image'):
if summary_verbosity >= 3:
tf.summary.image(
'original_image', tf.expand_dims(image, 0))
shape = tf.shape(image)
image_height = shape[0]
image_width = shape[1]
image_height_float = tf.cast(image_height, tf.float32)
image_width_float = tf.cast(image_width, tf.float32)
# This value is chosen so that in resnet, images are cropped to a size of
# 256 x 256, which matches what other implementations do. The final image
# size for resnet is 224 x 224, and floor(224 * 1.145) = 256.
scale_factor = 1.145
# Compute resize_height and resize_width to be the minimum values such that
# 1. The aspect ratio is maintained (i.e. resize_height / resize_width is
# image_height / image_width), and
# 2. resize_height >= height * `scale_factor`, and
# 3. resize_width >= width * `scale_factor`
max_ratio = tf.maximum(height / image_height_float,
width / image_width_float)
resize_height = tf.cast(image_height_float * max_ratio * scale_factor,
tf.int32)
resize_width = tf.cast(image_width_float * max_ratio * scale_factor,
tf.int32)
mlperf.logger.log_input_resize_aspect_preserving(height, width,
scale_factor)
# Resize the image to shape (`resize_height`, `resize_width`)
image_resize_method = get_image_resize_method(resize_method, batch_position)
distorted_image = tf.image.resize_images(image,
[resize_height, resize_width],
image_resize_method,
align_corners=False)
# Do a central crop of the image to size (height, width).
# MLPerf requires us to log (height, width) with two different keys.
mlperf.logger.log(key=mlperf.tags.INPUT_CENTRAL_CROP, value=[height, width])
mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width])
total_crop_height = (resize_height - height)
crop_top = total_crop_height // 2
total_crop_width = (resize_width - width)
crop_left = total_crop_width // 2
distorted_image = tf.slice(distorted_image, [crop_top, crop_left, 0],
[height, width, 3])
distorted_image.set_shape([height, width, 3])
if summary_verbosity >= 3:
tf.summary.image(
'cropped_resized_image', tf.expand_dims(distorted_image, 0))
image = distorted_image
return image
def train_image(image_buffer,
height,
width,
bbox,
batch_position,
resize_method,
distortions,
scope=None,
summary_verbosity=0,
distort_color_in_yiq=False,
fuse_decode_and_crop=False):
"""Distort one image for training a network.
Distorting images provides a useful technique for augmenting the data
set during training in order to make the network invariant to aspects
of the image that do not affect the label.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
height: integer
width: integer
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax].
batch_position: position of the image in a batch, which affects how images
are distorted and resized. NOTE: this argument can be an integer or a
tensor
resize_method: round_robin, nearest, bilinear, bicubic, or area.
distortions: If true, apply full distortions for image colors.
scope: Optional scope for op_scope.
summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both
summaries and checkpoints.
distort_color_in_yiq: distort color of input images in YIQ space.
fuse_decode_and_crop: fuse the decode/crop operation.
Returns:
3-D float Tensor of distorted image used for training.
"""
# with tf.op_scope([image, height, width, bbox], scope, 'distort_image'):
# with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]):
with tf.name_scope(scope or 'distort_image'):
# A large fraction of image datasets contain a human-annotated bounding box
# delineating the region of the image containing the object of interest. We
# choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
min_object_covered = 0.1
aspect_ratio_range = [0.75, 1.33]
area_range = [0.05, 1.0]
max_attempts = 100
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV,
value=min_object_covered)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE,
value=aspect_ratio_range)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE,
value=area_range)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS,
value=max_attempts)
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.image.extract_jpeg_shape(image_buffer),
bounding_boxes=bbox,
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
if summary_verbosity >= 3:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
image_with_distorted_box = tf.image.draw_bounding_boxes(
tf.expand_dims(image, 0), distort_bbox)
tf.summary.image(
'images_with_distorted_bounding_box',
image_with_distorted_box)
# Crop the image to the specified bounding box.
if fuse_decode_and_crop:
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
image = tf.image.decode_and_crop_jpeg(
image_buffer, crop_window, channels=3)
else:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
image = tf.slice(image, bbox_begin, bbox_size)
mlperf.logger.log(key=mlperf.tags.INPUT_RANDOM_FLIP)
distorted_image = tf.image.random_flip_left_right(image)
# This resizing operation may distort the images because the aspect
# ratio is not respected.
mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width])
image_resize_method = get_image_resize_method(resize_method, batch_position)
distorted_image = tf.image.resize_images(
distorted_image, [height, width],
image_resize_method,
align_corners=False)
# Restore the shape since the dynamic slice based upon the bbox_size loses
# the third dimension.
distorted_image.set_shape([height, width, 3])
if summary_verbosity >= 3:
tf.summary.image('cropped_resized_maybe_flipped_image',
tf.expand_dims(distorted_image, 0))
if distortions:
distorted_image = tf.cast(distorted_image, dtype=tf.float32)
# Image values are expected to be in [0, 1] for color distortion.
distorted_image /= 255.
# Randomly distort the colors.
distorted_image = distort_color(distorted_image, batch_position,
distort_color_in_yiq=distort_color_in_yiq)
# Note: This ensures the scaling matches the output of eval_image
distorted_image *= 255
if summary_verbosity >= 3:
tf.summary.image(
'final_distorted_image',
tf.expand_dims(distorted_image, 0))
return distorted_image
def distort_color(image, batch_position=0, distort_color_in_yiq=False,
scope=None):
"""Distort the color of the image.
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
Rather than adding that level of complication, we select a distinct ordering
of color ops based on the position of the image in a batch.
Args:
image: float32 Tensor containing single image. Tensor values should be in
range [0, 1].
batch_position: the position of the image in a batch. NOTE: this argument
can be an integer or a tensor
distort_color_in_yiq: distort color of input images in YIQ space.
scope: Optional scope for op_scope.
Returns:
color-distorted image
"""
if distort_color_in_yiq:
try:
from tensorflow.contrib.image.python.ops import distort_image_ops # pylint: disable=g-import-not-at-top
except ImportError:
raise ValueError(
'In TF2, you cannot pass --distortions unless you also pass '
'--nodistort_color_in_yiq. This is because the random_hsv_in_yiq was '
'removed in TF2. --distortions does not improve accuracy on resnet '
'so it is not recommended. --nodistort_color_in_yiq also has no '
'impact on accuracy, but may hurt performance.')
with tf.name_scope(scope or 'distort_color'):
def distort_fn_0(image=image):
"""Variant 0 of distort function."""
image = tf.image.random_brightness(image, max_delta=32. / 255.)
if distort_color_in_yiq:
image = distort_image_ops.random_hsv_in_yiq(
image, lower_saturation=0.5, upper_saturation=1.5,
max_delta_hue=0.2 * math.pi)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
return image
def distort_fn_1(image=image):
"""Variant 1 of distort function."""
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
if distort_color_in_yiq:
image = distort_image_ops.random_hsv_in_yiq(
image, lower_saturation=0.5, upper_saturation=1.5,
max_delta_hue=0.2 * math.pi)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
return image
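# For example, images at even batch positions get the order
# brightness -> saturation/hue -> contrast (distort_fn_0), while images at
# odd positions get brightness -> contrast -> saturation/hue
# (distort_fn_1).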
image = utils.smart_cond(batch_position % 2 == 0, distort_fn_0,
distort_fn_1)
# The random_* ops do not necessarily clamp.
image = tf.clip_by_value(image, 0.0, 1.0)
return image
class InputPreprocessor(object):
"""Base class for all model preprocessors."""
def __init__(self, batch_size, output_shapes):
self.batch_size = batch_size
self.output_shapes = output_shapes
def supports_datasets(self):
"""Whether this preprocessor supports dataset."""
return False
def minibatch(self, dataset, subset, params, shift_ratio=-1):
"""Returns tensors representing a minibatch of all the input."""
raise NotImplementedError('Must be implemented by subclass.')
# The methods added below are only supported/used if supports_datasets()
# returns True.
# TODO(laigd): refactor benchmark_cnn.py and put the logic of
# _build_input_processing() into InputPreprocessor.
def parse_and_preprocess(self, value, batch_position):
"""Function to parse and preprocess an Example proto in input pipeline."""
raise NotImplementedError('Must be implemented by subclass.')
# TODO(laigd): figure out how to remove these parameters, since the
# preprocessor itself has self.batch_size, self.num_splits, etc defined.
def build_multi_device_iterator(self, batch_size, num_splits, cpu_device,
params, gpu_devices, dataset, doing_eval):
"""Creates a MultiDeviceIterator."""
assert self.supports_datasets()
assert num_splits == len(gpu_devices)
with tf.name_scope('batch_processing'):
if doing_eval:
subset = 'validation'
else:
subset = 'train'
batch_size_per_split = batch_size // num_splits
ds = self.create_dataset(
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train=(not doing_eval),
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
ds,
gpu_devices,
source_device=cpu_device,
max_buffer_size=params.multi_device_iterator_max_buffer_size)
tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
multi_device_iterator.initializer)
return multi_device_iterator
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
raise NotImplementedError('Must be implemented by subclass.')
def create_iterator(self, ds):
ds_iterator = tf.data.make_initializable_iterator(ds)
tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
ds_iterator.initializer)
return ds_iterator
def minibatch_fn(self, batch_size, model_input_shapes, num_splits,
dataset, subset, train, datasets_repeat_cached_sample,
num_threads, datasets_use_caching,
datasets_parallel_interleave_cycle_length,
datasets_sloppy_parallel_interleave,
datasets_parallel_interleave_prefetch):
"""Returns a function and list of args for the fn to create a minibatch."""
assert self.supports_datasets()
batch_size_per_split = batch_size // num_splits
assert batch_size_per_split == model_input_shapes[0][0]
with tf.name_scope('batch_processing'):
ds = self.create_dataset(batch_size, num_splits, batch_size_per_split,
dataset, subset, train,
datasets_repeat_cached_sample, num_threads,
datasets_use_caching,
datasets_parallel_interleave_cycle_length,
datasets_sloppy_parallel_interleave,
datasets_parallel_interleave_prefetch)
ds_iterator = self.create_iterator(ds)
ds_iterator_string_handle = ds_iterator.string_handle()
@function.Defun(tf.string)
def _fn(h):
remote_iterator = tf.data.Iterator.from_string_handle(
h, ds_iterator.output_types, ds_iterator.output_shapes)
input_list = remote_iterator.get_next()
reshaped_input_list = [
tf.reshape(input_list[i], shape=model_input_shapes[i])
for i in range(len(input_list))
]
return reshaped_input_list
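# Usage sketch (how the caller is expected to use the return value): the
# args list holds the iterator's string-handle tensor; invoking
# _fn(handle) inside the graph rebuilds the iterator from that handle and
# reshapes each component to the static model input shapes.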
return _fn, [ds_iterator_string_handle]
class BaseImagePreprocessor(InputPreprocessor):
"""Base class for all image model preprocessors."""
def __init__(self,
batch_size,
output_shapes,
num_splits,
dtype,
train,
distortions,
resize_method,
shift_ratio=-1,
summary_verbosity=0,
distort_color_in_yiq=True,
fuse_decode_and_crop=True,
match_mlperf=False):
super(BaseImagePreprocessor, self).__init__(batch_size, output_shapes)
image_shape = output_shapes[0]
# image_shape is in form (batch_size, height, width, depth)
self.height = image_shape[1]
self.width = image_shape[2]
self.depth = image_shape[3]
self.num_splits = num_splits
self.dtype = dtype
self.train = train
self.resize_method = resize_method
self.shift_ratio = shift_ratio
self.distortions = distortions
self.distort_color_in_yiq = distort_color_in_yiq
self.fuse_decode_and_crop = fuse_decode_and_crop
if self.batch_size % self.num_splits != 0:
raise ValueError(
('batch_size must be a multiple of num_splits: '
'batch_size %d, num_splits: %d') %
(self.batch_size, self.num_splits))
self.batch_size_per_split = self.batch_size // self.num_splits
self.summary_verbosity = summary_verbosity
self.match_mlperf = match_mlperf
def parse_and_preprocess(self, value, batch_position):
assert self.supports_datasets()
image_buffer, label_index, bbox, _ = parse_example_proto(value)
if self.match_mlperf:
bbox = tf.zeros((1, 0, 4), dtype=bbox.dtype)
mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=False)
else:
mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=True)
image = self.preprocess(image_buffer, bbox, batch_position)
return (image, label_index)
def preprocess(self, image_buffer, bbox, batch_position):
raise NotImplementedError('Must be implemented by subclass.')
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
file_names = gfile.Glob(glob_pattern)
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave,
prefetch_input_elements=datasets_parallel_interleave_prefetch))
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
counter = tf.data.Dataset.range(batch_size)
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
buffer_size = 10000
mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=buffer_size)
ds = ds.apply(
tf.data.experimental.shuffle_and_repeat(buffer_size=buffer_size))
else:
ds = ds.repeat()
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=self.parse_and_preprocess,
batch_size=batch_size_per_split,
num_parallel_batches=num_splits))
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
ds = threadpool.override_threadpool(
ds,
threadpool.PrivateThreadPool(
num_threads, display_name='input_pipeline_thread_pool'))
return ds
class RecordInputImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor for images with RecordInput format."""
def preprocess(self, image_buffer, bbox, batch_position):
"""Preprocessing image_buffer as a function of its batch position."""
if self.train:
image = train_image(image_buffer, self.height, self.width, bbox,
batch_position, self.resize_method, self.distortions,
None, summary_verbosity=self.summary_verbosity,
distort_color_in_yiq=self.distort_color_in_yiq,
fuse_decode_and_crop=self.fuse_decode_and_crop)
else:
image = tf.image.decode_jpeg(
image_buffer, channels=3, dct_method='INTEGER_FAST')
image = eval_image(image, self.height, self.width, batch_position,
self.resize_method,
summary_verbosity=self.summary_verbosity)
# Note: image is now float32 [height,width,3] with range [0, 255]
# image = tf.cast(image, tf.uint8) # HACK TESTING
if self.match_mlperf:
mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION,
value=_CHANNEL_MEANS)
normalized = image - _CHANNEL_MEANS
else:
normalized = normalized_image(image)
return tf.cast(normalized, self.dtype)
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
if shift_ratio < 0:
shift_ratio = self.shift_ratio
with tf.name_scope('batch_processing'):
# Build final results per split.
images = [[] for _ in range(self.num_splits)]
labels = [[] for _ in range(self.num_splits)]
if params.use_datasets:
ds = self.create_dataset(
self.batch_size, self.num_splits, self.batch_size_per_split,
dataset, subset, self.train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
for d in xrange(self.num_splits):
images[d], labels[d] = ds_iterator.get_next()
# TODO(laigd): consider removing the --use_datasets option, it should
# always use datasets.
else:
record_input = data_flow_ops.RecordInput(
file_pattern=dataset.tf_record_pattern(subset),
seed=301,
parallelism=64,
buffer_size=10000,
batch_size=self.batch_size,
shift_ratio=shift_ratio,
name='record_input')
records = record_input.get_yield_op()
records = tf.split(records, self.batch_size, 0)
records = [tf.reshape(record, []) for record in records]
for idx in xrange(self.batch_size):
value = records[idx]
(image, label) = self.parse_and_preprocess(value, idx)
split_index = idx % self.num_splits
labels[split_index].append(label)
images[split_index].append(image)
for split_index in xrange(self.num_splits):
if not params.use_datasets:
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.concat(labels[split_index], 0)
images[split_index] = tf.reshape(
images[split_index],
shape=[self.batch_size_per_split, self.height, self.width,
self.depth])
labels[split_index] = tf.reshape(labels[split_index],
[self.batch_size_per_split])
return images, labels
def supports_datasets(self):
return True
class ImagenetPreprocessor(RecordInputImagePreprocessor):
def preprocess(self, image_buffer, bbox, batch_position):
# pylint: disable=g-import-not-at-top
try:
from official.r1.resnet.imagenet_preprocessing import preprocess_image
except ImportError:
tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.')
raise
if self.train:
image = preprocess_image(
image_buffer, bbox, self.height, self.width, self.depth,
is_training=True)
else:
image = preprocess_image(
image_buffer, bbox, self.height, self.width, self.depth,
is_training=False)
return tf.cast(image, self.dtype)
class Cifar10ImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor for Cifar10 input images."""
def _distort_image(self, image):
"""Distort one image for training a network.
We adopt the standard data augmentation scheme that is widely used for
this dataset: the images are first zero-padded with 4 pixels on each side,
then randomly cropped back to their original size; half of the images
are then horizontally mirrored.
Args:
image: input image.
Returns:
distorted image.
"""
image = tf.image.resize_image_with_crop_or_pad(
image, self.height + 8, self.width + 8)
distorted_image = tf.random_crop(image,
[self.height, self.width, self.depth])
# Randomly flip the image horizontally.
distorted_image = tf.image.random_flip_left_right(distorted_image)
if self.summary_verbosity >= 3:
tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0))
return distorted_image
def _eval_image(self, image):
"""Get the image for model evaluation."""
distorted_image = tf.image.resize_image_with_crop_or_pad(
image, self.height, self.width)
if self.summary_verbosity >= 3:
tf.summary.image('cropped.image', tf.expand_dims(distorted_image, 0))
return distorted_image
def preprocess(self, raw_image):
"""Preprocessing raw image."""
if self.summary_verbosity >= 3:
tf.summary.image('raw.image', tf.expand_dims(raw_image, 0))
if self.train and self.distortions:
image = self._distort_image(raw_image)
else:
image = self._eval_image(raw_image)
normalized = normalized_image(image)
return tf.cast(normalized, self.dtype)
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
# TODO(jsimsa): Implement datasets code path
del shift_ratio, params
with tf.name_scope('batch_processing'):
all_images, all_labels = dataset.read_data_files(subset)
all_images = tf.constant(all_images)
all_labels = tf.constant(all_labels)
input_image, input_label = tf.train.slice_input_producer(
[all_images, all_labels])
input_image = tf.cast(input_image, self.dtype)
input_label = tf.cast(input_label, tf.int32)
# Ensure that the random shuffling has good mixing properties.
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(dataset.num_examples_per_epoch(subset) *
min_fraction_of_examples_in_queue)
raw_images, raw_labels = tf.train.shuffle_batch(
[input_image, input_label], batch_size=self.batch_size,
capacity=min_queue_examples + 3 * self.batch_size,
min_after_dequeue=min_queue_examples)
images = [[] for i in range(self.num_splits)]
labels = [[] for i in range(self.num_splits)]
# Create a list of size batch_size, each containing one image of the
# batch. Without the unstack call, raw_images[i] would still access the
# same image via a strided_slice op, but would be slower.
raw_images = tf.unstack(raw_images, axis=0)
raw_labels = tf.unstack(raw_labels, axis=0)
for i in xrange(self.batch_size):
split_index = i % self.num_splits
# The raw image read from data has the format [depth, height, width]
# reshape to the format returned by minibatch.
raw_image = tf.reshape(raw_images[i],
[dataset.depth, dataset.height, dataset.width])
raw_image = tf.transpose(raw_image, [1, 2, 0])
image = self.preprocess(raw_image)
images[split_index].append(image)
labels[split_index].append(raw_labels[i])
for split_index in xrange(self.num_splits):
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.parallel_stack(labels[split_index])
return images, labels
class COCOPreprocessor(BaseImagePreprocessor):
"""Preprocessor for COCO dataset input images, boxes, and labels."""
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
del shift_ratio # Not used when using datasets instead of data_flow_ops
with tf.name_scope('batch_processing'):
ds = self.create_dataset(
self.batch_size, self.num_splits, self.batch_size_per_split,
dataset, subset, self.train, params.datasets_repeat_cached_sample)
ds_iterator = self.create_iterator(ds)
# Training data: 4 tuple
# Validation data: 5 tuple
# See get_input_shapes in models/ssd_model.py for details.
input_len = 4 if subset == 'train' else 5
input_lists = [[None for _ in range(self.num_splits)]
for _ in range(input_len)]
for d in xrange(self.num_splits):
input_list = ds_iterator.get_next()
for i in range(input_len):
input_lists[i][d] = input_list[i]
return input_lists
def preprocess(self, data):
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
import ssd_constants # pylint: disable=g-import-not-at-top
from object_detection.core import preprocessor # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation')
image_buffer = data['image_buffer']
boxes = data['groundtruth_boxes']
classes = tf.reshape(data['groundtruth_classes'], [-1, 1])
source_id = tf.string_to_number(data['source_id'])
raw_shape = data['raw_shape']
ssd_encoder = ssd_dataloader.Encoder()
# Only 80 of the 90 COCO classes are used.
class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
classes = tf.gather(class_map, classes)
classes = tf.cast(classes, dtype=tf.float32)
if self.train:
image, boxes, classes = ssd_dataloader.ssd_decode_and_crop(
image_buffer, boxes, classes, raw_shape)
# ssd_crop resizes and returns image of dtype float32 and does not change
# its range (i.e., value in between 0--255). Divide by 255. converts it
# to [0, 1] range. Not doing this before cropping to avoid dtype cast
# (which incurs additional memory copy).
image /= 255.
image, boxes = preprocessor.random_horizontal_flip(
image=image, boxes=boxes)
# Random horizontal flip probability is 50%
# See https://github.com/tensorflow/models/blob/master/research/object_detection/core/preprocessor.py # pylint: disable=line-too-long
mlperf.logger.log(key=mlperf.tags.RANDOM_FLIP_PROBABILITY, value=0.5)
image = tf.cast(image, self.dtype)
encoded_returns = ssd_encoder.encode_labels(boxes, classes)
encoded_classes, encoded_boxes, num_matched_boxes = encoded_returns
# Shape of image: [width, height, channel]
# Shape of encoded_boxes: [NUM_SSD_BOXES, 4]
# Shape of encoded_classes: [NUM_SSD_BOXES, 1]
# Shape of num_matched_boxes: [1]
return (image, encoded_boxes, encoded_classes, num_matched_boxes)
else:
image = tf.image.decode_jpeg(image_buffer)
image = tf.image.resize_images(
image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
# resize_image returns image of dtype float32 and does not change its
# range. Divide by 255 to convert image to [0, 1] range.
image /= 255.
image = ssd_dataloader.normalize_image(image)
image = tf.cast(image, self.dtype)
def trim_and_pad(inp_tensor):
"""Limit the number of boxes, and pad if necessary."""
inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
return tf.reshape(inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES,
inp_tensor.get_shape()[1]])
boxes, classes = trim_and_pad(boxes), trim_and_pad(classes)
# Shape of boxes: [MAX_NUM_EVAL_BOXES, 4]
# Shape of classes: [MAX_NUM_EVAL_BOXES, 1]
# Shape of source_id: [] (scalar tensor)
# Shape of raw_shape: [3]
return (image, boxes, classes, source_id, raw_shape)
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation')
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
ds = tf.data.TFRecordDataset.list_files(glob_pattern, shuffle=train)
# TODO(haoyuzhang): Enable map+filter fusion after cl/218399112 in release
# options = tf.data.Options()
# options.experimental_optimization = tf.data.experimental.OptimizationOptions() # pylint: disable=line-too-long
# options.experimental_optimization.map_and_filter_fusion = True
# ds = ds.with_options(options)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave))
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=10000)
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
else:
ds = ds.repeat()
ds = ds.map(ssd_dataloader.ssd_parse_example_proto, num_parallel_calls=64)
ds = ds.filter(
lambda data: tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0))
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=self.preprocess,
batch_size=batch_size_per_split,
num_parallel_batches=num_splits,
drop_remainder=train))
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
ds = threadpool.override_threadpool(
ds,
threadpool.PrivateThreadPool(
num_threads, display_name='input_pipeline_thread_pool'))
return ds
def supports_datasets(self):
return True
class TestImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor used for testing.
set_fake_data() sets which images and labels will be output by minibatch(),
and must be called before minibatch(). This allows tests to easily specify
a set of images to use for training, without having to create any files.
Queue runners must be started for this preprocessor to work.
"""
def __init__(self,
batch_size,
output_shapes,
num_splits,
dtype,
train=None,
distortions=None,
resize_method=None,
shift_ratio=0,
summary_verbosity=0,
distort_color_in_yiq=False,
fuse_decode_and_crop=False,
match_mlperf=False):
super(TestImagePreprocessor, self).__init__(
batch_size, output_shapes, num_splits, dtype, train, distortions,
resize_method, shift_ratio, summary_verbosity=summary_verbosity,
distort_color_in_yiq=distort_color_in_yiq,
fuse_decode_and_crop=fuse_decode_and_crop, match_mlperf=match_mlperf)
self.expected_subset = None
def set_fake_data(self, fake_images, fake_labels):
assert len(fake_images.shape) == 4
assert len(fake_labels.shape) == 1
num_images = fake_images.shape[0]
assert num_images == fake_labels.shape[0]
assert num_images % self.batch_size == 0
self.fake_images = fake_images
self.fake_labels = fake_labels
def minibatch(self,
dataset,
subset,
params,
shift_ratio=0):
"""Get test image batches."""
del dataset, params
if (not hasattr(self, 'fake_images') or
not hasattr(self, 'fake_labels')):
raise ValueError('Must call set_fake_data() before calling minibatch '
'on TestImagePreprocessor')
if self.expected_subset is not None:
assert subset == self.expected_subset
shift_ratio = shift_ratio or self.shift_ratio
fake_images = cnn_util.roll_numpy_batches(self.fake_images, self.batch_size,
shift_ratio)
fake_labels = cnn_util.roll_numpy_batches(self.fake_labels, self.batch_size,
shift_ratio)
with tf.name_scope('batch_processing'):
image_slice, label_slice = tf.train.slice_input_producer(
[fake_images, fake_labels],
shuffle=False,
name='image_slice')
raw_images, raw_labels = tf.train.batch(
[image_slice, label_slice], batch_size=self.batch_size,
name='image_batch')
images = [[] for _ in range(self.num_splits)]
labels = [[] for _ in range(self.num_splits)]
for i in xrange(self.batch_size):
split_index = i % self.num_splits
raw_image = tf.cast(raw_images[i], self.dtype)
images[split_index].append(raw_image)
labels[split_index].append(raw_labels[i])
for split_index in xrange(self.num_splits):
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.parallel_stack(labels[split_index])
normalized = [normalized_image(part) for part in images]
return [[tf.cast(part, self.dtype) for part in normalized], labels]
class LibrispeechPreprocessor(InputPreprocessor):
"""Preprocessor for librispeech class for all image model preprocessors."""
def __init__(self, batch_size, output_shapes, num_splits, dtype, train,
**kwargs):
del kwargs
super(LibrispeechPreprocessor, self).__init__(batch_size, output_shapes)
self.num_splits = num_splits
self.dtype = dtype
self.is_train = train
if self.batch_size % self.num_splits != 0:
raise ValueError(('batch_size must be a multiple of num_splits: '
'batch_size %d, num_splits: %d') % (self.batch_size,
self.num_splits))
self.batch_size_per_split = self.batch_size // self.num_splits
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
# TODO(laigd): currently the only difference between this and the one in
# BaseImagePreprocessor is, this uses map() and padded_batch() while the
# latter uses tf.data.experimental.map_and_batch(). Try to merge them.
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
file_names = gfile.Glob(glob_pattern)
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave,
prefetch_input_elements=datasets_parallel_interleave_prefetch))
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
counter = tf.data.Dataset.range(batch_size)
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
else:
ds = ds.repeat()
ds = ds.map(map_func=self.parse_and_preprocess,
num_parallel_calls=batch_size_per_split*num_splits)
ds = ds.padded_batch(
batch_size=batch_size_per_split,
padded_shapes=tuple([
tf.TensorShape(output_shape[1:])
for output_shape in self.output_shapes
]),
drop_remainder=True)
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
ds = threadpool.override_threadpool(
ds,
threadpool.PrivateThreadPool(
num_threads, display_name='input_pipeline_thread_pool'))
return ds
def minibatch(self, dataset, subset, params, shift_ratio=-1):
assert params.use_datasets
# TODO(laigd): unify this with CNNModel's minibatch()
# TODO(laigd): in distributed mode we use shift_ratio so different workers
# won't work on same inputs, so we should respect that.
del shift_ratio
with tf.name_scope('batch_processing'):
ds = self.create_dataset(
self.batch_size,
self.num_splits,
self.batch_size_per_split,
dataset,
subset,
self.is_train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
# The four lists are: input spectrogram feature, labels, input lengths,
# label lengths
input_lists = [[None for _ in range(self.num_splits)] for _ in range(4)]
for d in xrange(self.num_splits):
input_list = ds_iterator.get_next()
for i in range(4):
input_lists[i][d] = input_list[i]
assert self.output_shapes == [
input_lists[i][0].shape.as_list() for i in range(4)
]
return tuple(input_lists)
def supports_datasets(self):
return True
def parse_and_preprocess(self, value, batch_position):
"""Parse an TFRecord."""
del batch_position
assert self.supports_datasets()
context_features = {
'labels': tf.VarLenFeature(dtype=tf.int64),
'input_length': tf.FixedLenFeature([], dtype=tf.int64),
'label_length': tf.FixedLenFeature([], dtype=tf.int64),
}
sequence_features = {
'features': tf.FixedLenSequenceFeature([161], dtype=tf.float32)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=value,
context_features=context_features,
sequence_features=sequence_features,
)
return [
# Input
tf.expand_dims(sequence_parsed['features'], axis=2),
# Label
tf.cast(
tf.reshape(
tf.sparse_tensor_to_dense(context_parsed['labels']), [-1]),
dtype=tf.int32),
# Input length
tf.cast(
tf.reshape(context_parsed['input_length'], [1]),
dtype=tf.int32),
# Label length
tf.cast(
tf.reshape(context_parsed['label_length'], [1]),
dtype=tf.int32),
]
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.sw[op]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# PyCharm
.idea/
# For mac
.DS_Store
Table of Contents
=================
* [Table of Contents](#table-of-contents)
* [Introduction](#introduction)
* [Executing tests](#executing-tests)
* [PerfZero on private GCE instance.](#perfzero-on-private-gce-instance)
* [Step one: Create GCE Instance](#step-one-create-gce-instance)
* [Step two: Build docker on GCE instance](#step-two-build-docker-on-gce-instance)
* [Step three: Start and "enter" the docker instance](#step-three-start-and-enter-the-docker-instance)
* [Step four: Run tests](#step-four-run-tests)
* [Step five: Delete the instance when done](#step-five-delete-the-instance-when-done)
* [PerfZero on local workstation or any server](#perfzero-on-local-workstation-or-any-server)
* [PerfZero without docker](#perfzero-without-docker)
* [Creating tests](#creating-tests)
* [Deep dive into individual tools](#deep-dive-into-individual-tools)
* [Build docker image](#build-docker-image)
* [Run benchmark](#run-benchmark)
* [Instructions for managing Google Cloud Platform computing instance](#instructions-for-managing-google-cloud-platform-computing-instance)
* [Understand the benchmark execution output](#understand-the-benchmark-execution-output)
* [Json formatted benchmark summary](#json-formatted-benchmark-summary)
* [Profiling](#profiling)
* [Visualize in TensorBoard](#visualize-in-tensorboard)
* [Visualize system metric values over time](#visualize-system-metric-values-over-time)
* [PerfZero development](#perfzero-development)
# Introduction
PerfZero is a benchmark framework for TensorFlow. It intends to address the
following use cases:
1) For users who want to execute a TensorFlow test to debug a performance
regression.
PerfZero makes it easy to execute a pre-defined test by consolidating the
docker image build, GPU driver installation, TensorFlow installation, benchmark
library checkout, data download, system statistics collection, benchmark metrics
collection, profiler data collection and so on into 2 to 3 commands. This allows
developers to focus on investigating the issue rather than setting up the test
environment.
2) For users who want to track TensorFlow performance changes across a variety
of setups (e.g. GPU model, cuDNN version, TensorFlow version).
Developers can set up periodic jobs to execute these benchmark methods using
PerfZero. PerfZero will collect the information needed to identify the
benchmark (e.g. GPU model, TensorFlow version, dependent library git hash), get
the benchmark execution result (e.g. wall time, accuracy, succeeded or not),
summarize the result in an easy-to-read JSON string, and upload the result to a
BigQuery table. Using the data in the BigQuery table, users can then visualize
performance changes in a dashboard, compare performance between different
setups in a table, and trigger alerts when there is a performance regression.
# Executing tests
There are multiple ways to use PerfZero to execute a test, listed from highest
to lowest level of abstraction:
* [PerfZero on private GCE instance](#perfzero-on-private-gce-instance)
* [PerfZero on local workstation or any server](#perfzero-on-local-workstation-or-any-server)
* [PerfZero without docker](#perfzero-without-docker)
## PerfZero on private GCE instance.
There are many variations on this approach; to get you started quickly, the
steps below detail setting up an 8xV100 instance with local NVMe drives on
which the training data is stored. The only downside of this setup is that,
due to the local NVMe drives, the instance cannot be stopped; it can only be
deleted.
### Step one: Create GCE Instance
The command below creates an 8xV100 instance with 4 NVMe drives. Its output
provides the command to run to SSH to the machine. To set the project, zone,
and other options, read the
[cloud_manager tool details](https://github.com/tensorflow/benchmarks/tree/master/perfzero#instructions-for-managing-google-cloud-platform-computing-instance).
```bash
python perfzero/lib/cloud_manager.py create --accelerator_count 8 --nvme_count 4
```
### Step two: Build docker on GCE instance
After logging into the instance, run the following command to build a docker
image with the latest nightly TF 2.0 build. For more options, read the
[build docker image section](https://github.com/tensorflow/benchmarks/tree/master/perfzero#build-docker-image).
```bash
python3 perfzero/lib/setup.py --dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_v2
```
For all options for building the docker image, including controlling the version
of TensorFlow installed, check out the public
[README for PerfZero](https://github.com/tensorflow/benchmarks/tree/master/perfzero#build-docker-image).
### Step three: Start and "enter" the docker instance
```bash
nvidia-docker run -it --rm -v $(pwd):/workspace -v /data:/data perfzero/tensorflow bash
```
### Step four: Run tests
The command below pulls GitHub official/models, downloads the cifar-10 dataset
from our internal Google Cloud storage bucket, and executes a ResNet56 benchmark
with the TensorFlow 2.0 nightly build. For info on the args, read the
[run benchmark section](https://github.com/tensorflow/benchmarks/tree/master/perfzero#run-benchmark).
```bash
python3 /workspace/perfzero/lib/benchmark.py \
--git_repos="https://github.com/tensorflow/models.git;benchmark" \
--python_path=models \
--data_downloads="gs://tf-perf-imagenet-uswest1/tensorflow/cifar10_data/cifar-10-batches-bin" \
--benchmark_methods=official.benchmark.keras_cifar_benchmark.Resnet56KerasBenchmarkReal.benchmark_1_gpu_no_dist_strat
```
For all options that can be used when executing a test, check out the public
[README for PerfZero](https://github.com/tensorflow/benchmarks/tree/master/perfzero#run-benchmark).
### Step five: Delete the instance when done
```bash
python perfzero/lib/cloud_manager.py delete
```
## PerfZero on local workstation or any server
This approach is the same as running PerfZero on a private GCE instance; simply
jump to Step two: Build docker on GCE instance.
If the workstation does not have access to the PerfZero GCS bucket and does not
need access (e.g. the data has already been copied locally by another method),
passing `--gcloud_key_file_url=""` will skip attempting to download the key.
A quick test that does not require accessing GCS for data is:
```bash
python3 /workspace/perfzero/lib/benchmark.py \
--git_repos="https://github.com/tensorflow/models.git;benchmark" \
--python_path=models \
--gcloud_key_file_url="" \
--benchmark_methods=official.benchmark.keras_cifar_benchmark.Resnet56KerasBenchmarkSynth.benchmark_1_gpu_no_dist_strat
```
## PerfZero without docker
PerfZero does not depend on Docker; Docker is used to handle dependencies and
create a consistent environment. Most tests do not require much beyond
TensorFlow, and PerfZero itself mostly depends on Google Cloud, and only for
downloading data and uploading results if desired. While this approach works,
we do not maintain a clear list of the required libraries; the Docker build
files are a good starting point.
Once the requirements are met, the command below can be executed. It pulls
GitHub official/models, downloads the cifar-10 dataset from our internal Google
Cloud storage bucket, and executes a ResNet benchmark with the TensorFlow 2.0
nightly build.
```bash
python3 /workspace/perfzero/lib/benchmark.py \
--git_repos="https://github.com/tensorflow/models.git;benchmark" \
--python_path=models \
--data_downloads="gs://tf-perf-imagenet-uswest1/tensorflow/cifar10_data/cifar-10-batches-bin" \
--benchmark_methods=official.r1.resnet.estimator_benchmark.Resnet50EstimatorBenchmarkReal.benchmark_graph_1_gpu
```
# Creating tests
Here are the instructions that developers of a benchmark method need to follow
in order to run the benchmark method in PerfZero. See
[estimator_benchmark.py](https://github.com/tensorflow/models/blob/master/official/r1/resnet/estimator_benchmark.py)
for example test code that supports PerfZero.
1) The benchmark class should extend the TensorFlow python class
[tensorflow.test.Benchmark](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/platform/benchmark.py) and should have a
constructor with signature `__init__(self, output_dir, root_data_dir, **kwargs)`.
Below is the usage of each argument:
- The benchmark method should put all generated files (e.g. logs) in `output_dir` so that PerfZero can
upload these files to Google Cloud Storage when `--output_gcs_url` is specified.
- The benchmark method should read data from `root_data_dir`, e.g. from `${root_data_dir}/cifar-10-binary`.
- `**kwargs` keeps the benchmark constructor forward compatible when PerfZero provides more named arguments to the benchmark constructor before
the benchmark class is updated.
2) At the end of the benchmark method execution, the method should call [report_benchmark()](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/platform/benchmark.py)
with the following parameters:
```
tf.test.Benchmark.report_benchmark(
iters=num_iteration, # The number of iterations of the benchmark.
wall_time=wall_time_sec, # Total wall time in sec for all iterations.
metrics = [ # List of metric entries
{
'name': 'accuracy_top_5', # Metric name
'value': 80, # Metric value
'min_value': 90, # Optional. Minimum acceptable metric value for the benchmark to succeed.
'max_value': 99 # Optional. Maximum acceptable metric value for the benchmark to succeed.
},
{
'name': 'accuracy_top_1',
'value': 99.5
}
]
)
```
This format allows PerfZero to indicate in its summary whether the benchmark
has succeeded (e.g. for a convergence test), based on the logic determined by
the benchmark developer. A minimal sketch of a benchmark class following these
conventions is shown below.
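Putting the pieces together, here is a minimal sketch of a PerfZero-compatible
benchmark class. The class name, method name, placeholder accuracy, and
threshold are hypothetical; only the constructor signature and the
`report_benchmark()` parameters follow the conventions above.
```
import time

import tensorflow as tf


class SampleBenchmark(tf.test.Benchmark):
  """Hypothetical PerfZero-compatible benchmark."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    # PerfZero passes output_dir (for generated files) and root_data_dir
    # (for input data); **kwargs keeps the constructor forward compatible.
    self.output_dir = output_dir
    self.root_data_dir = root_data_dir

  def benchmark_sample(self):
    start_time = time.time()
    accuracy = 0.95  # Placeholder; a real benchmark runs training/eval here.
    wall_time = time.time() - start_time
    self.report_benchmark(
        iters=1,              # Number of iterations of the benchmark.
        wall_time=wall_time,  # Total wall time in seconds.
        metrics=[{
            'name': 'accuracy_top_1',
            'value': accuracy,
            'min_value': 0.9,  # Optional success threshold.
        }])
```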
3) Include dependent libraries in `--git_repos` and `--python_path`. These
libraries will be checked out in the directory
`path_to_perfzero/workspace/site-packages` by default. Developers can edit these
libraries directly and execute the benchmark with local changes.
# Deep dive into individual tools
The sections below go into details about the individual components of PerfZero.
## Build docker image
The command below builds the docker image named `perfzero/tensorflow`, which
contains the libraries (e.g. TensorFlow) needed for the benchmark.
```
python3 benchmarks/perfzero/lib/setup.py
```
Here are a few selected optional flags. Run `python3 setup.py -h` to see
detailed documentation for all supported flags.
1) Use `--dockerfile_path=docker/Dockerfile_ubuntu_1804_tf_v2` to build the docker image for TensorFlow v2.
2) Use `--tensorflow_pip_spec` to specify the tensorflow pip package name (and optionally version) to be
installed in the docker image, e.g. `--tensorflow_pip_spec=tensorflow==1.12.0`.
## Run benchmark
The command below executes the benchmark method specified by `--benchmark_methods`.
```
export ROOT_DATA_DIR=/data
nvidia-docker run -it --rm -v $(pwd):/workspace -v $ROOT_DATA_DIR:$ROOT_DATA_DIR perfzero/tensorflow \
python3 /workspace/benchmarks/perfzero/lib/benchmark.py \
--gcloud_key_file_url="" \
--git_repos="https://github.com/tensorflow/models.git;benchmark" \
--python_path=models \
--benchmark_methods=official.r1.resnet.estimator_benchmark.Resnet50EstimatorBenchmarkSynth.benchmark_graph_1_gpu \
--root_data_dir=$ROOT_DATA_DIR
```
`${ROOT_DATA_DIR}` should be the directory which contains the dataset files
required by the benchmark method. If the flag `--data_downloads` is specified,
PerfZero will download files from the specified url to the directory specified
by the flag `--root_data_dir`. Otherwise, the user needs to manually download
and move the dataset files into the directory specified by `--root_data_dir`.
The default `root_data_dir` is `/data`. Some benchmark methods, like the one run
in the sample command above, do not require any dataset files.
Here are a few selected optional flags. Run `python3 benchmark.py -h` to see
detailed documentation for all supported flags.
1) Use `--workspace=unique_workspace_name` if you need to run multiple benchmarks
using different workspace setups. One example use case is that you may want to
test a branch from a pull request without changing your existing workspace.
2) Use `--debug` if you need to see debug-level logging.
3) Use `--git_repos="git_url;git_branch;git_hash"` to check out a git repo with
the specified git_branch at the specified git_hash into the local checkout
directory (`path_to_perfzero/workspace/site-packages` by default). **Note that**
the value of the flag `--git_repos` is wrapped in quotation marks `"` so that
`;` will not be interpreted by bash as the end of the command. Specify the flag
once for each repository you want to check out, e.g.
`--git_repos="https://github.com/tensorflow/models.git;benchmark"`.
4) Use `--profiler_enabled_time=start_time:end_time` to collect profiler data
during the period `[start_time, end_time)` after the benchmark method execution
starts. Skip `end_time` in the flag value to collect data until the end of the
benchmark method execution. See [here](#visualize-in-tensorboard)
for instructions on how to use the generated profiler data.
## Instructions for managing Google Cloud Platform computing instance
PerfZero aims to make it easy to run and debug TensorFlow, which is usually run
with GPUs. However, most users do not have a dedicated machine with expensive
hardware. One cost-effective solution is for users to create a machine with the
desired hardware on demand in a public cloud when they need to debug TensorFlow.
We provide a script in PerfZero to make it easy to manage computing instances in
Google Cloud Platform. This assumes that you have access to an existing project
in GCP.
Run `python perfzero/lib/cloud_manager.py -h` for the list of commands supported
by the script. Run `cloud_manager.py <command> -h` to see detailed documentation
for all supported flags for the specified `command`.
In most cases, the user only needs to run the following commands:
```
# Create a new instance that is unique to your username
python perfzero/lib/cloud_manager.py create --project=project_name
# Query the status and IP address of the existing instance created by you
python perfzero/lib/cloud_manager.py status --project=project_name
# Stop the instance
python perfzero/lib/cloud_manager.py stop --project=project_name
# Start the instance
python perfzero/lib/cloud_manager.py start --project=project_name
# Delete the instance
python perfzero/lib/cloud_manager.py delete --project=project_name
```
## Understand the benchmark execution output
### Json formatted benchmark summary
PerfZero outputs a json-formatted summary that provides the information needed
to understand the benchmark result. The summary is printed to stdout and
written to the file `path_to_perfzero/${workspace}/output/${execution_id}/perfzero.log`.
Additionally, PerfZero outputs a pure json file containing the summary at
`path_to_perfzero/${workspace}/output/${execution_id}/perfzero_summary.json`.
Here is an example output from PerfZero. An explanation is provided inline for
each key whose name is not sufficiently self-explanatory.
```
{
"ml_framework_info": { # Summary of the machine learning framework
"version": "1.13.0-dev20190206", # Short version. It is tf.__version__ for TensorFlow
"name": "tensorflow", # Machine learning framework name such as PyTorch
"build_label": "ml_framework_build_label", # Specified by the flag --ml_framework_build_label
"build_version": "v1.12.0-7504-g9b32b5742b" # Long version. It is tf.__git_version__ for TensorFlow
},
"execution_timestamp": 1550040322.8991697, # Timestamp when the benchmark is executed
"execution_id": "2019-02-13-06-45-22-899128", # A string that uniquely identify this benchmark execution
"benchmark_info": { # Summary of the benchmark framework setup
"output_url": "gs://tf-performance/test-results/2019-02-13-06-45-22-899128/", # Google storage url that contains the log file from this benchmark execution
"has_exception": false,
"site_package_info": {
"models": {
"branch": "benchmark",
"url": "https://github.com/tensorflow/models.git",
"hash": "f788046ca876a8820e05b0b48c1fc2e16b0955bc"
},
"benchmarks": {
"branch": "master",
"url": "https://github.com/tensorflow/benchmarks.git",
"hash": "af9e0ef36fc6867d9b63ebccc11f229375cd6a31"
}
},
"harness_name": "perfzero",
"harness_info": {
"url": "https://github.com/tensorflow/benchmarks.git",
"branch": "master",
"hash": "75d2991b88630dde10ef65aad8082a6d5cd8b5fc"
},
"execution_label": "execution_label" # Specified by the flag --execution_label
},
"system_info": { # Summary of the resources in the system that is used to execute the benchmark
"system_name": "system_name", # Specified by the flag --system_name
"accelerator_count": 2, # Number of GPUs in the system
"physical_cpu_count": 8, # Number of physical cpu cores in the system. Hyper thread CPUs are excluded.
"logical_cpu_count": 16, # Number of logical cpu cores in the system. Hyper thread CPUs are included.
"cpu_socket_count": 1, # Number of cpu socket in the system.
"platform_name": "platform_name", # Specified by the flag --platform_name
"accelerator_model": "Tesla V100-SXM2-16GB",
"accelerator_driver_version": "410.48",
"cpu_model": "Intel(R) Xeon(R) CPU @ 2.20GHz"
},
"process_info": { # Summary of the resources used by the process to execute the benchmark
"max_rss": 4269047808, # maximum physical memory in bytes used by the process
"max_vms": 39894450176, # maximum virtual memory in bytes used by the process
"max_cpu_percent": 771.1 # CPU utilization as a percentage. See psutil.Process.cpu_percent() for more information
},
"benchmark_result": { # Summary of the benchmark execution results. This is pretty much the same data structure defined in test_log.proto.
# Most values are read from test_log.proto which is written by tf.test.Benchmark.report_benchmark() defined in TensorFlow library.
"metrics": [ # This is derived from `extras` [test_log.proto](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto)
# which is written by report_benchmark().
# If the EntryValue is double, then name is the extra's key and value is extra's double value.
# If the EntryValue is string, then name is the extra's key. The string value will be a JSON-formatted string whose keys
# include `value`, `succeeded` and `description`. Benchmark methods can provide arbitrary metric key/value pairs here.
{
"name": "accuracy_top_5",
"value": 0.7558000087738037
},
{
"name": "accuracy_top_1",
"value": 0.2639999985694885
}
],
"name": "official.resnet.estimator_cifar_benchmark.EstimatorCifar10BenchmarkTests.unit_test", # Full path to the benchmark method, i.e. module_path.class_name.method_name
"succeeded": true, # True iff benchmark method execution finishes without exception and no metric in metrics show succeeded = false
"wall_time": 14.552583694458008 # The value is determined by tf.test.Benchmark.report_benchmark() called by the benchmark method. It is -1 if report_benchmark() is not called.
}
}
```
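Because `perfzero_summary.json` contains the same structure as pure json,
downstream tooling can consume it directly. Below is a minimal sketch that
reads the summary and prints the benchmark result; the path is an example and
should be replaced with your own workspace and execution id.
```
import json

summary_path = ('perfzero/workspace/output/'
                '2019-02-13-06-45-22-899128/perfzero_summary.json')
with open(summary_path) as f:
  summary = json.load(f)

result = summary['benchmark_result']
print(result['name'], 'succeeded:', result['succeeded'],
      'wall_time:', result['wall_time'])
for metric in result['metrics']:
  print(metric['name'], '=', metric['value'])
```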
### Profiling
When the flag `--profiler_enabled_time=start_time:end_time` is specified, the
profiler data will be collected and stored in
`path_to_perfzero/${workspace}/output/${execution_id}/profiler_data`.
#### Visualize in TensorBoard
First, install the profile plugin for TensorBoard.
```
pip install -U tensorboard-plugin-profile
```
Run `tensorboard --logdir=path_to_perfzero/workspace/output/${execution_id}/profiler_data` or
`python3 -m tensorboard.main --logdir=path_to_perfzero/workspace/output/${execution_id}/profiler_data` to open
TensorBoard server.
If PerfZero is executed on a remote machine, run `ssh -L
6006:127.0.0.1:6006 remote_ip` before opening `http://localhost:6006` in your
browser to access the TensorBoard UI.
You can also run TensorBoard inside the docker container, e.g.
`tensorboard --logdir=/workspace/perfzero/workspace/output/${execution_id}/profiler_data --bind_all`
In this case, you have to start docker with port mapping, i.e. with the "-p 6006:6006" flag, e.g.
```
nvidia-docker run -it --rm -v $(pwd):/workspace -p 6006:6006 perfzero/tensorflow
```
Normally, the pages you see will look like:
![Screenshot](screenshots/profiling_overview.png "Profiling Overview")
![Screenshot](screenshots/profiling_trace_view.png "Profiling Trace View")
### Visualize system metric values over time
PerfZero also records a few useful system metrics (e.g. rss, vms) over time in
the file `path_to_perfzero/${workspace}/output/${execution_id}/process_info.log`.
Run `python perfzero/scripts/plot_process_info.py process_info.log` to generate a
pdf showing the value of these metrics over time.
# PerfZero development
Avoid importing the `tensorflow` package in any place that requires the
`logging` package, because the tensorflow package appears to prevent logging
from working properly. Import the `tensorflow` package only in the method that
requires it, as in the sketch below.
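A minimal sketch of this lazy-import pattern (the function name is
illustrative):
```
import logging


def run_tensorflow_step():
  # Import tensorflow inside the method so that merely importing this
  # module does not let TensorFlow interfere with logging configuration.
  import tensorflow as tf
  logging.info('TensorFlow version: %s', tf.__version__)
```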
Here are the commands to run unit tests and check code style.
```
# Run all unit tests
# This must be executed in directory perfzero/lib
python3 -B -m unittest discover -p "*_test.py"
# Format python code in place
find perfzero/lib -name "*.py" -exec pyformat --in_place {} \;
# Check python code format and report warnings and errors
find perfzero/lib -name "*.py" -exec gpylint3 {} \;
```
Here is the command to generate the table of contents for this README. Run this
command and copy/paste the output into README.md.
```
./perfzero/scripts/generate-readme-header.sh perfzero/README.md
```
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu-2.0-preview
# - Installs requirements.txt for tensorflow/models
# - Installs bazel for building TF from source
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu-2.0-preview"
ARG extra_pip_specs=""
ARG local_tensorflow_pip_spec=""
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-dev-10-0 \
cuda-curand-dev-10-0 \
cuda-cusolver-dev-10-0 \
cuda-cusparse-dev-10-0 \
libcudnn7=7.6.2.24-1+cuda10.0 \
libcudnn7-dev=7.6.2.24-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl \
&& \
find /usr/local/cuda-10.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
build-essential \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python
# (building TF needs py2 even if building for Python3 as of 06-AUG-2019)
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv \
python
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip3 install -r /tmp/requirements.txt
RUN pip3 freeze
# Install bazel
ARG BAZEL_VERSION=0.24.1
RUN mkdir /bazel && \
wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
chmod +x /bazel/installer.sh && \
/bazel/installer.sh && \
rm -f /bazel/installer.sh
RUN git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
# Ubuntu 18.04 Python3 with CUDA 10 and the following:
# - Installs tf-nightly-gpu (this is TF 2.0)
# - Installs requirements.txt for tensorflow/models
# Additionally also installs:
# - Latest S4TF development snapshot for cuda 10.0
FROM nvidia/cuda:10.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ARG swift_tf_url=https://storage.googleapis.com/swift-tensorflow-artifacts/nightlies/latest/swift-tensorflow-DEVELOPMENT-cuda10.0-cudnn7-ubuntu18.04.tar.gz
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cublas-dev-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libcudnn7=7.6.2.24-1+cuda10.0 \
libcudnn7-dev=7.6.2.24-1+cuda10.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.0 \
libnvinfer-dev=5.1.5-1+cuda10.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
### Install Swift deps.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
curl \
git \
python \
python-dev \
python-pip \
python-setuptools \
python-tk \
python3 \
python3-pip \
python3-setuptools \
clang \
libcurl4-openssl-dev \
libicu-dev \
libpython-dev \
libpython3-dev \
libncurses5-dev \
libxml2 \
libblocksruntime-dev
# Download and extract S4TF
WORKDIR /swift-tensorflow-toolchain
RUN if ! curl -fSsL --retry 5 $swift_tf_url -o swift.tar.gz; \
then sleep 30 && curl -fSsL --retry 5 $swift_tf_url -o swift.tar.gz; \
fi;
RUN mkdir usr \
&& tar -xzf swift.tar.gz --directory=usr --strip-components=1 \
&& rm swift.tar.gz
ENV PATH="/swift-tensorflow-toolchain/usr/bin:${PATH}"
ENV LD_LIBRARY_PATH="/swift-tensorflow-toolchain/usr/lib/swift/linux/:${LD_LIBRARY_PATH}"
# Ubuntu 18.04 Python3 with CUDA 10.1 and the following:
# - Installs tf-nightly-gpu (this is TF 2.1)
# - Installs requirements.txt for tensorflow/models
# - TF 2.0 tested with cuda 10.0, but we need to test tf 2.1 with cuda 10.1.
# Additionally also installs
# - Latest S4TF development snapshot for cuda 10.1
FROM nvidia/cuda:10.1-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ARG swift_tf_url=https://storage.googleapis.com/swift-tensorflow-artifacts/nightlies/latest/swift-tensorflow-DEVELOPMENT-cuda10.1-cudnn7-stock-ubuntu18.04.tar.gz
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g. to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-1 \
cuda-cufft-10-1 \
cuda-curand-10-1 \
cuda-cusolver-10-1 \
cuda-cusparse-10-1 \
libcudnn7=7.6.4.38-1+cuda10.1 \
libcudnn7-dev=7.6.4.38-1+cuda10.1 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.1 \
libnvinfer-dev=5.1.5-1+cuda10.1 \
libnvinfer6=6.0.1-1+cuda10.1 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
### Install Swift deps.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
curl \
git \
python \
python-dev \
python-pip \
python-setuptools \
python-tk \
python3 \
python3-pip \
python3-setuptools \
clang \
libcurl4-openssl-dev \
libicu-dev \
libpython-dev \
libpython3-dev \
libncurses5-dev \
libxml2 \
libblocksruntime-dev
# Download and extract S4TF
WORKDIR /swift-tensorflow-toolchain
RUN if ! curl -fSsL --retry 5 $swift_tf_url -o swift.tar.gz; \
then sleep 30 && curl -fSsL --retry 5 $swift_tf_url -o swift.tar.gz; \
fi;
RUN mkdir usr \
&& tar -xzf swift.tar.gz --directory=usr --strip-components=1 \
&& rm swift.tar.gz
ENV PATH="/swift-tensorflow-toolchain/usr/bin:${PATH}"
ENV LD_LIBRARY_PATH="/swift-tensorflow-toolchain/usr/lib/swift/linux/:${LD_LIBRARY_PATH}"
# Ubuntu 18.04 Python3 with CUDA 11.0 and the following:
# - Installs tf-nightly-gpu (this is TF 2.4)
# - Installs requirements.txt for tensorflow/models
# Additionally also installs
# - Latest S4TF development snapshot for cuda 11.0
FROM nvidia/cuda:11.0-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ARG swift_tf_url=https://storage.googleapis.com/swift-tensorflow-artifacts/nightlies/latest/swift-tensorflow-DEVELOPMENT-cuda11.0-cudnn8-stock-ubuntu18.04.tar.gz
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn8-dev only needed because of libnvinfer-dev which may not
# really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g. to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-tools-11-0 \
cuda-toolkit-11-0 \
libcudnn8=8.0.4.30-1+cuda11.0 \
libcudnn8-dev=8.0.4.30-1+cuda11.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer7=7.2.0-1+cuda11.0 \
libnvinfer-dev=7.2.0-1+cuda11.0 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
### Install Swift deps.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
curl \
git \
python \
python-dev \
python-pip \
python-setuptools \
python-tk \
python3 \
python3-pip \
python3-setuptools \
clang \
libcurl4-openssl-dev \
libicu-dev \
libpython-dev \
libpython3-dev \
libncurses5-dev \
libxml2 \
libblocksruntime-dev
# Download and extract S4TF
WORKDIR /swift-tensorflow-toolchain
RUN if ! curl -fSsL --retry 5 $swift_tf_url -o swift.tar.gz; \
then sleep 30 && curl -fSsL --retry 5 $swift_tf_url -o swift.tar.gz; \
fi;
RUN mkdir usr \
&& tar -xzf swift.tar.gz --directory=usr --strip-components=1 \
&& rm swift.tar.gz
ENV PATH="/swift-tensorflow-toolchain/usr/bin:${PATH}"
ENV LD_LIBRARY_PATH="/swift-tensorflow-toolchain/usr/lib/swift/linux/:${LD_LIBRARY_PATH}"
# Ubuntu 18.04 Python3 with CUDA 10.1 and the following:
# - Installs tf-nightly-gpu (this is TF 2.1)
# - Installs requirements.txt for tensorflow/models
# - TF 2.0 tested with cuda 10.0, but we need to test tf 2.1 with cuda 10.1.
FROM nvidia/cuda:10.1-base-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g. to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-command-line-tools-10-1 \
cuda-cufft-10-1 \
cuda-curand-10-1 \
cuda-cusolver-10-1 \
cuda-cusparse-10-1 \
libcudnn7=7.6.4.38-1+cuda10.1 \
libcudnn7-dev=7.6.4.38-1+cuda10.1 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
RUN apt-get update && \
apt-get install -y --no-install-recommends libnvinfer5=5.1.5-1+cuda10.1 \
libnvinfer-dev=5.1.5-1+cuda10.1 \
libnvinfer6=6.0.1-1+cuda10.1 \
&& apt-get clean
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python and Python3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip, need to use pip3 and then pip after this or an error
# is thrown for no main found.
RUN pip3 install --upgrade pip
# setuptools upgraded to fix install requirements from model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (this is TF 2.3)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ENV PIP_CMD="python3.9 -m pip"
# setup.py passes the base path of the local .whl file if one is chosen for the docker image.
# Otherwise it passes an empty existing file from the context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev only needed because of libnvinfer-dev which may not
# really be needed.
# In the future, add the following lines in a shell script running on the
# benchmark vm to get the available dependent versions when updating cuda
# version (e.g. to 10.2 or something later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
# Needed to disable prompts during installation.
ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# Python 3.9 related deps in this ppa.
RUN add-apt-repository ppa:deadsnakes/ppa
# Install / update Python 3
RUN apt-get install -y --no-install-recommends \
python3.9 \
python3-pip \
python3.9-dev \
python3-setuptools \
python3.9-venv \
python3.9-distutils \
python3.9-lib2to3
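# Note: python3.9 from the deadsnakes PPA does not replace the default python3,
# which is why PIP_CMD pins the interpreter explicitly. Repointing python3 via
# update-alternatives would be an alternative, but it can break system tooling
# on 18.04 (illustrative, not used here):
# RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1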
# Upgrade pip. After this, pip has to be invoked as pip3 first and then pip,
# otherwise an error about a missing main module is thrown.
RUN ${PIP_CMD} install --upgrade pip
RUN ${PIP_CMD} install --upgrade distlib
# setuptools is upgraded to fix installing the requirements from the model garden.
RUN ${PIP_CMD} install --upgrade setuptools
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.2/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
RUN ${PIP_CMD} install --upgrade pyyaml
RUN ${PIP_CMD} install --upgrade google-api-python-client==1.8.0
RUN ${PIP_CMD} install --upgrade google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN ${PIP_CMD} install wheel
RUN ${PIP_CMD} install absl-py
RUN ${PIP_CMD} install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN ${PIP_CMD} install tfds-nightly
RUN ${PIP_CMD} install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN ${PIP_CMD} install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN ${PIP_CMD} install -r /tmp/requirements.txt
RUN ${PIP_CMD} install tf-estimator-nightly
RUN ${PIP_CMD} install tensorflow-text-nightly
# RUN nvidia-smi
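# nvcc --version is a build-time sanity check that the CUDA toolkit is on the
# PATH; nvidia-smi stays commented out, presumably because GPUs are not visible
# during docker build.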
RUN nvcc --version
RUN ${PIP_CMD} freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (the current TF nightly build)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file when one is chosen for
# the docker image; otherwise it passes an empty placeholder file from the
# context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev, which
# itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark vm to get the available dependent versions when updating the cuda
# version (e.g. to 10.2 or later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-tools-11-0 \
cuda-toolkit-11-0 \
libcudnn8=8.0.4.30-1+cuda11.0 \
libcudnn8-dev=8.0.4.30-1+cuda11.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python 3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. After this, pip has to be invoked as pip3 first and then pip,
# otherwise an error about a missing main module is thrown.
RUN pip3 install --upgrade pip
# setuptools is upgraded to fix installing the requirements from the model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN pip install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip install tf-estimator-nightly
RUN pip install tensorflow-text-nightly
# RUN nvidia-smi
RUN nvcc --version
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (the current TF nightly build)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file when one is chosen for
# the docker image; otherwise it passes an empty placeholder file from the
# context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev, which
# itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark vm to get the available dependent versions when updating the cuda
# version (e.g. to 10.2 or later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.2/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python 3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. After this, pip has to be invoked as pip3 first and then pip,
# otherwise an error about a missing main module is thrown.
RUN pip3 install --upgrade pip
# setuptools is upgraded to fix installing the requirements from the model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN pip install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip install tf-estimator-nightly
RUN pip install tensorflow-text-nightly
RUN pip install psutil
# RUN nvidia-smi
RUN nvcc --version
RUN pip freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (the current TF nightly build)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ENV PIP_CMD="python3.9 -m pip"
# setup.py passes the base path of the local .whl file when one is chosen for
# the docker image; otherwise it passes an empty placeholder file from the
# context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev, which
# itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark vm to get the available dependent versions when updating the cuda
# version (e.g. to 10.2 or later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
# Needed to disable prompts during installation.
ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# Python 3.9 related deps come from this PPA.
RUN add-apt-repository ppa:deadsnakes/ppa
# Install / update Python 3
RUN apt-get install -y --no-install-recommends \
python3.9 \
python3-pip \
python3.9-dev \
python3-setuptools \
python3.9-venv \
python3.9-distutils \
python3.9-lib2to3
# Upgrade pip. After this, pip has to be invoked as pip3 first and then pip,
# otherwise an error about a missing main module is thrown.
RUN ${PIP_CMD} install --upgrade pip
RUN ${PIP_CMD} install --upgrade distlib
# setuptools is upgraded to fix installing the requirements from the model garden.
RUN ${PIP_CMD} install --upgrade setuptools
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.2/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
RUN ${PIP_CMD} install --upgrade pyyaml
RUN ${PIP_CMD} install --upgrade google-api-python-client==1.8.0
RUN ${PIP_CMD} install --upgrade google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN ${PIP_CMD} install wheel
RUN ${PIP_CMD} install absl-py
RUN ${PIP_CMD} install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN ${PIP_CMD} install tfds-nightly
RUN ${PIP_CMD} install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN ${PIP_CMD} install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN ${PIP_CMD} install -r /tmp/requirements.txt
RUN ${PIP_CMD} install tf-estimator-nightly
RUN ${PIP_CMD} install tensorflow-text-nightly
RUN ${PIP_CMD} install keras-nightly==2.7.0.dev2021082607
# RUN nvidia-smi
RUN nvcc --version
RUN ${PIP_CMD} freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (the current TF nightly build)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
ENV PIP_CMD="python3.9 -m pip"
# setup.py passes the base path of the local .whl file when one is chosen for
# the docker image; otherwise it passes an empty placeholder file from the
# context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev, which
# itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark vm to get the available dependent versions when updating the cuda
# version (e.g. to 10.2 or later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
# Needed to disable prompts during installation.
ENV DEBIAN_FRONTEND noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# Python 3.9 related deps come from this PPA.
RUN add-apt-repository ppa:deadsnakes/ppa
# Install / update Python 3
RUN apt-get install -y --no-install-recommends \
python3.9 \
python3-pip \
python3.9-dev \
python3-setuptools \
python3.9-venv \
python3.9-distutils \
python3.9-lib2to3
# Upgrade pip. After this, pip has to be invoked as pip3 first and then pip,
# otherwise an error about a missing main module is thrown.
RUN ${PIP_CMD} install --upgrade pip
RUN ${PIP_CMD} install --upgrade distlib
# setuptools is upgraded to fix installing the requirements from the model garden.
RUN ${PIP_CMD} install --upgrade setuptools
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.2/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
RUN ${PIP_CMD} install --upgrade pyyaml
RUN ${PIP_CMD} install --upgrade google-api-python-client==1.8.0
RUN ${PIP_CMD} install --upgrade google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN ${PIP_CMD} install wheel
RUN ${PIP_CMD} install absl-py
RUN ${PIP_CMD} install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN ${PIP_CMD} install tfds-nightly
RUN ${PIP_CMD} install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN ${PIP_CMD} install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN ${PIP_CMD} install -r /tmp/requirements.txt
RUN ${PIP_CMD} install tf-estimator-nightly
RUN ${PIP_CMD} install tensorflow-text-nightly
RUN ${PIP_CMD} install keras-nightly==2.7.0.dev2021070900
# RUN nvidia-smi
RUN nvcc --version
RUN ${PIP_CMD} freeze
# Ubuntu 18.04 Python3 with CUDA 11 and the following:
# - Installs tf-nightly-gpu (the current TF nightly build)
# - Installs requirements.txt for tensorflow/models
FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 as base
ARG tensorflow_pip_spec="tf-nightly-gpu"
ARG local_tensorflow_pip_spec=""
ARG extra_pip_specs=""
# setup.py passes the base path of the local .whl file when one is chosen for
# the docker image; otherwise it passes an empty placeholder file from the
# context.
COPY ${local_tensorflow_pip_spec} /${local_tensorflow_pip_spec}
# Pick up some TF dependencies
# cublas-dev and libcudnn7-dev are only needed because of libnvinfer-dev, which
# itself may not really be needed.
# In the future, add the following lines to a shell script running on the
# benchmark vm to get the available dependent versions when updating the cuda
# version (e.g. to 10.2 or later):
# sudo apt-cache search cuda-command-line-tool
# sudo apt-cache search cuda-cublas
# sudo apt-cache search cuda-cufft
# sudo apt-cache search cuda-curand
# sudo apt-cache search cuda-cusolver
# sudo apt-cache search cuda-cusparse
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cuda-tools-11-0 \
cuda-toolkit-11-0 \
libcudnn8=8.0.4.30-1+cuda11.0 \
libcudnn8-dev=8.0.4.30-1+cuda11.0 \
libfreetype6-dev \
libhdf5-serial-dev \
libzmq3-dev \
libpng-dev \
pkg-config \
software-properties-common \
unzip \
lsb-core \
curl
# For CUDA profiling, TensorFlow requires CUPTI.
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH
# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8
# Add google-cloud-sdk to the source list
RUN echo "deb http://packages.cloud.google.com/apt cloud-sdk-$(lsb_release -c -s) main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Install extras needed by most models
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
ca-certificates \
wget \
htop \
zip \
google-cloud-sdk
# Install / update Python 3
RUN apt-get install -y --no-install-recommends \
python3 \
python3-dev \
python3-pip \
python3-setuptools \
python3-venv
# Upgrade pip. After this, pip has to be invoked as pip3 first and then pip,
# otherwise an error about a missing main module is thrown.
RUN pip3 install --upgrade pip
# setuptools is upgraded to fix installing the requirements from the model garden.
RUN pip install wheel
RUN pip install --upgrade setuptools google-api-python-client==1.8.0 pyyaml google-cloud google-cloud-bigquery google-cloud-datastore mock
RUN pip install absl-py
RUN pip install --upgrade --force-reinstall ${tensorflow_pip_spec} ${extra_pip_specs}
RUN pip install tfds-nightly
RUN pip install -U scikit-learn
# Install dependencies needed for tf.distribute test utils
RUN pip install dill tblib portpicker
RUN curl https://raw.githubusercontent.com/tensorflow/models/master/official/requirements.txt > /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip install tf-estimator-nightly
RUN pip install tensorflow-text-nightly
# RUN nvidia-smi
RUN nvcc --version
RUN pip freeze