Commit 7f99c1c3 authored by huchen

Merge branch 'dtk21.10.1_v1' into 'main'

Update some TF files

See merge request dcutoolkit/deeplearing/dlexamples_new!5
parents 6b6f8b0c cf66c525
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility code for the default platform."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import tempfile
import cnn_util
from models import model_config
_ROOT_PROJECT_DIR = os.path.dirname(cnn_util.__file__)
def define_platform_params():
"""Defines platform-specific parameters.
Currently there are no platform-specific parameters to be defined.
"""
pass
def get_cluster_manager(params, config_proto):
"""Returns the cluster manager to be used."""
return cnn_util.GrpcClusterManager(params, config_proto)
def get_command_to_run_python_module(module):
"""Returns a command to run a Python module."""
  python_interpreter = sys.executable
  if not python_interpreter:
    raise ValueError('Could not find Python interpreter')
  return [python_interpreter,
          os.path.join(_ROOT_PROJECT_DIR, module + '.py')]
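# Usage sketch (illustration only, not part of the upstream file): the
# returned list can be passed directly to the subprocess module. The module
# name 'benchmark_cnn' below is an assumed example, not required by this API.
def _example_get_command_to_run_python_module():
  cmd = get_command_to_run_python_module('benchmark_cnn')
  # cmd is [sys.executable, os.path.join(_ROOT_PROJECT_DIR, 'benchmark_cnn.py')]
  return cmd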
def get_test_output_dir():
"""Returns a directory where test outputs should be placed."""
base_dir = os.environ.get('TEST_OUTPUTS_DIR',
'/tmp/tf_cnn_benchmarks_test_outputs')
if not os.path.exists(base_dir):
os.mkdir(base_dir)
return tempfile.mkdtemp(dir=base_dir)
def get_test_data_dir():
"""Returns the path to the test_data directory."""
return os.path.join(_ROOT_PROJECT_DIR, 'test_data')
def get_ssd_backborn_model_file():
raise NotImplementedError
def get_ssd_backboard_data_dir():
raise NotImplementedError
def _initialize(params, config_proto):
del params, config_proto
model_config.register_tf1_models()
_is_initialized = False
def initialize(params, config_proto):
  global _is_initialized
  if _is_initialized:
    return
  _is_initialized = True
  _initialize(params, config_proto)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility code for a certain platform.
This file simply imports everything from the default platform. To switch to a
different platform, the import statement can be changed to point to a new
platform.
Creating a custom platform can be useful to, e.g., run some initialization code
required by the platform or register a platform-specific model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from platforms.default.util import * # pylint: disable=unused-import,wildcard-import
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Image pre-processing utilities.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import cnn_util
from tensorflow.python.data.ops import multi_device_iterator_ops
from tensorflow.python.framework import function
from tensorflow.python.layers import utils
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
import mlperf
def parse_example_proto(example_serialized):
"""Parses an Example proto containing a training example of an image.
The output of the build_image_data.py image preprocessing script is a dataset
containing serialized Example protocol buffers. Each Example proto contains
the following fields:
image/height: 462
image/width: 581
image/colorspace: 'RGB'
image/channels: 3
image/class/label: 615
image/class/synset: 'n03623198'
image/class/text: 'knee pad'
image/object/bbox/xmin: 0.1
image/object/bbox/xmax: 0.9
image/object/bbox/ymin: 0.2
image/object/bbox/ymax: 0.6
image/object/bbox/label: 615
image/format: 'JPEG'
image/filename: 'ILSVRC2012_val_00041207.JPEG'
image/encoded: <JPEG encoded string>
Args:
example_serialized: scalar Tensor tf.string containing a serialized
Example protocol buffer.
Returns:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
label: Tensor tf.int32 containing the label.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as
[ymin, xmin, ymax, xmax].
text: Tensor tf.string containing the human-readable label.
"""
# Dense features in Example proto.
feature_map = {
'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
default_value=''),
'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
default_value=-1),
'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
default_value=''),
}
sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
# Sparse features in Example proto.
feature_map.update(
{k: sparse_float32 for k in ['image/object/bbox/xmin',
'image/object/bbox/ymin',
'image/object/bbox/xmax',
'image/object/bbox/ymax']})
features = tf.parse_single_example(example_serialized, feature_map)
label = tf.cast(features['image/class/label'], dtype=tf.int32)
xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
  # Note that the ordering is (y, x), to match the [ymin, xmin, ymax, xmax]
  # convention used by TensorFlow's image ops.
bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
# Force the variable number of bounding boxes into the shape
# [1, num_boxes, coords].
bbox = tf.expand_dims(bbox, 0)
bbox = tf.transpose(bbox, [0, 2, 1])
return features['image/encoded'], label, bbox, features['image/class/text']
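# Usage sketch (illustration only, not part of the upstream file): builds a
# minimal serialized Example with the fields documented above and runs it
# through parse_example_proto. All field values below are made up.
def _example_parse_example_proto():
  example = tf.train.Example(features=tf.train.Features(feature={
      'image/encoded': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[b'fake-jpeg-bytes'])),
      'image/class/label': tf.train.Feature(
          int64_list=tf.train.Int64List(value=[615])),
      'image/class/text': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[b'knee pad'])),
      'image/object/bbox/xmin': tf.train.Feature(
          float_list=tf.train.FloatList(value=[0.1])),
      'image/object/bbox/ymin': tf.train.Feature(
          float_list=tf.train.FloatList(value=[0.2])),
      'image/object/bbox/xmax': tf.train.Feature(
          float_list=tf.train.FloatList(value=[0.9])),
      'image/object/bbox/ymax': tf.train.Feature(
          float_list=tf.train.FloatList(value=[0.6])),
  }))
  serialized = tf.constant(example.SerializeToString())
  image_buffer, label, bbox, text = parse_example_proto(serialized)
  # bbox has shape [1, num_boxes, 4] with coordinates [ymin, xmin, ymax, xmax].
  return image_buffer, label, bbox, text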
_RESIZE_METHOD_MAP = {
'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR,
'bilinear': tf.image.ResizeMethod.BILINEAR,
'bicubic': tf.image.ResizeMethod.BICUBIC,
'area': tf.image.ResizeMethod.AREA
}
def get_image_resize_method(resize_method, batch_position=0):
"""Get tensorflow resize method.
If resize_method is 'round_robin', return different methods based on batch
position in a round-robin fashion. NOTE: If the batch size is not a multiple
of the number of methods, then the distribution of methods will not be
uniform.
Args:
resize_method: (string) nearest, bilinear, bicubic, area, or round_robin.
batch_position: position of the image in a batch. NOTE: this argument can
be an integer or a tensor
Returns:
    one of the resize methods defined in tf.image.ResizeMethod.
"""
if resize_method != 'round_robin':
return _RESIZE_METHOD_MAP[resize_method]
# return a resize method based on batch position in a round-robin fashion.
resize_methods = list(_RESIZE_METHOD_MAP.values())
def lookup(index):
return resize_methods[index]
def resize_method_0():
return utils.smart_cond(batch_position % len(resize_methods) == 0,
lambda: lookup(0), resize_method_1)
def resize_method_1():
return utils.smart_cond(batch_position % len(resize_methods) == 1,
lambda: lookup(1), resize_method_2)
def resize_method_2():
return utils.smart_cond(batch_position % len(resize_methods) == 2,
lambda: lookup(2), lambda: lookup(3))
# NOTE(jsimsa): Unfortunately, we cannot use a single recursive function here
# because TF would not be able to construct a finite graph.
return resize_method_0()
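# Usage sketch (illustration only, not part of the upstream file): with an
# integer batch_position, 'round_robin' cycles through the methods in
# _RESIZE_METHOD_MAP insertion order (guaranteed in Python 3.7+ dicts), so
# positions 0..3 yield nearest, bilinear, bicubic, area, and position 4
# wraps back to nearest.
def _example_round_robin_resize_method():
  return [get_image_resize_method('round_robin', batch_position=i)
          for i in range(5)]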
def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32):
"""Decode a JPEG string into one 3-D float image Tensor.
Args:
image_buffer: scalar string Tensor.
scope: Optional scope for op_scope.
Returns:
3-D float Tensor with values ranging from [0, 1).
"""
# with tf.op_scope([image_buffer], scope, 'decode_jpeg'):
# with tf.name_scope(scope, 'decode_jpeg', [image_buffer]):
with tf.name_scope(scope or 'decode_jpeg'):
# Decode the string as an RGB JPEG.
# Note that the resulting image contains an unknown height and width
# that is set dynamically by decode_jpeg. In other words, the height
# and width of image is unknown at compile-time.
image = tf.image.decode_jpeg(image_buffer, channels=3,
fancy_upscaling=False,
dct_method='INTEGER_FAST')
# image = tf.Print(image, [tf.shape(image)], 'Image shape: ')
return image
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
def normalized_image(images):
# Rescale from [0, 255] to [0, 2]
images = tf.multiply(images, 1. / 127.5)
# Rescale to [-1, 1]
mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION, value=[1.0] * 3)
return tf.subtract(images, 1.0)
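# Worked example (illustration): a pixel value of 0 maps to 0 / 127.5 - 1 = -1,
# 127.5 maps to 0, and 255 maps to 255 / 127.5 - 1 = 1, giving the [-1, 1]
# output range described above.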
def eval_image(image,
height,
width,
batch_position,
resize_method,
summary_verbosity=0):
"""Get the image for model evaluation.
  We preprocess the image similarly to Slim; see
https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py
Validation images do not have bounding boxes, so to crop the image, we first
resize the image such that the aspect ratio is maintained and the resized
height and width are both at least 1.145 times `height` and `width`
respectively. Then, we do a central crop to size (`height`, `width`).
Args:
image: 3-D float Tensor representing the image.
height: The height of the image that will be returned.
width: The width of the image that will be returned.
batch_position: position of the image in a batch, which affects how images
are distorted and resized. NOTE: this argument can be an integer or a
tensor
resize_method: one of the strings 'round_robin', 'nearest', 'bilinear',
'bicubic', or 'area'.
summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both
summaries and checkpoints.
Returns:
An image of size (output_height, output_width, 3) that is resized and
cropped as described above.
"""
# TODO(reedwm): Currently we resize then crop. Investigate if it's faster to
# crop then resize.
with tf.name_scope('eval_image'):
if summary_verbosity >= 3:
tf.summary.image(
'original_image', tf.expand_dims(image, 0))
shape = tf.shape(image)
image_height = shape[0]
image_width = shape[1]
image_height_float = tf.cast(image_height, tf.float32)
image_width_float = tf.cast(image_width, tf.float32)
# This value is chosen so that in resnet, images are cropped to a size of
# 256 x 256, which matches what other implementations do. The final image
# size for resnet is 224 x 224, and floor(224 * 1.145) = 256.
scale_factor = 1.145
# Compute resize_height and resize_width to be the minimum values such that
# 1. The aspect ratio is maintained (i.e. resize_height / resize_width is
# image_height / image_width), and
# 2. resize_height >= height * `scale_factor`, and
# 3. resize_width >= width * `scale_factor`
max_ratio = tf.maximum(height / image_height_float,
width / image_width_float)
resize_height = tf.cast(image_height_float * max_ratio * scale_factor,
tf.int32)
resize_width = tf.cast(image_width_float * max_ratio * scale_factor,
tf.int32)
mlperf.logger.log_input_resize_aspect_preserving(height, width,
scale_factor)
# Resize the image to shape (`resize_height`, `resize_width`)
image_resize_method = get_image_resize_method(resize_method, batch_position)
distorted_image = tf.image.resize_images(image,
[resize_height, resize_width],
image_resize_method,
align_corners=False)
# Do a central crop of the image to size (height, width).
# MLPerf requires us to log (height, width) with two different keys.
mlperf.logger.log(key=mlperf.tags.INPUT_CENTRAL_CROP, value=[height, width])
mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width])
total_crop_height = (resize_height - height)
crop_top = total_crop_height // 2
total_crop_width = (resize_width - width)
crop_left = total_crop_width // 2
distorted_image = tf.slice(distorted_image, [crop_top, crop_left, 0],
[height, width, 3])
distorted_image.set_shape([height, width, 3])
if summary_verbosity >= 3:
tf.summary.image(
'cropped_resized_image', tf.expand_dims(distorted_image, 0))
image = distorted_image
return image
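# Worked example (illustration only): for a 480x640 source image and
# height = width = 224, max_ratio = max(224/480, 224/640) = 0.4667, so
# resize_height = int(480 * 0.4667 * 1.145) = 256 and
# resize_width = int(640 * 0.4667 * 1.145) = 341. The central crop then
# removes (256 - 224) // 2 = 16 rows from the top and (341 - 224) // 2 = 58
# columns from the left before taking the 224x224 slice.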
def train_image(image_buffer,
height,
width,
bbox,
batch_position,
resize_method,
distortions,
scope=None,
summary_verbosity=0,
distort_color_in_yiq=False,
fuse_decode_and_crop=False):
"""Distort one image for training a network.
Distorting images provides a useful technique for augmenting the data
set during training in order to make the network invariant to aspects
  of the image that do not affect the label.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
height: integer
width: integer
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax].
batch_position: position of the image in a batch, which affects how images
are distorted and resized. NOTE: this argument can be an integer or a
tensor
resize_method: round_robin, nearest, bilinear, bicubic, or area.
distortions: If true, apply full distortions for image colors.
scope: Optional scope for op_scope.
summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both
summaries and checkpoints.
distort_color_in_yiq: distort color of input images in YIQ space.
fuse_decode_and_crop: fuse the decode/crop operation.
Returns:
3-D float Tensor of distorted image used for training.
"""
# with tf.op_scope([image, height, width, bbox], scope, 'distort_image'):
# with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]):
with tf.name_scope(scope or 'distort_image'):
# A large fraction of image datasets contain a human-annotated bounding box
# delineating the region of the image containing the object of interest. We
# choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
min_object_covered = 0.1
aspect_ratio_range = [0.75, 1.33]
area_range = [0.05, 1.0]
max_attempts = 100
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV,
value=min_object_covered)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE,
value=aspect_ratio_range)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE,
value=area_range)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS,
value=max_attempts)
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.image.extract_jpeg_shape(image_buffer),
bounding_boxes=bbox,
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
if summary_verbosity >= 3:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
image_with_distorted_box = tf.image.draw_bounding_boxes(
tf.expand_dims(image, 0), distort_bbox)
tf.summary.image(
'images_with_distorted_bounding_box',
image_with_distorted_box)
# Crop the image to the specified bounding box.
if fuse_decode_and_crop:
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
image = tf.image.decode_and_crop_jpeg(
image_buffer, crop_window, channels=3)
else:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
image = tf.slice(image, bbox_begin, bbox_size)
mlperf.logger.log(key=mlperf.tags.INPUT_RANDOM_FLIP)
distorted_image = tf.image.random_flip_left_right(image)
# This resizing operation may distort the images because the aspect
# ratio is not respected.
mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width])
image_resize_method = get_image_resize_method(resize_method, batch_position)
distorted_image = tf.image.resize_images(
distorted_image, [height, width],
image_resize_method,
align_corners=False)
# Restore the shape since the dynamic slice based upon the bbox_size loses
# the third dimension.
distorted_image.set_shape([height, width, 3])
if summary_verbosity >= 3:
tf.summary.image('cropped_resized_maybe_flipped_image',
tf.expand_dims(distorted_image, 0))
if distortions:
distorted_image = tf.cast(distorted_image, dtype=tf.float32)
      # Image values are expected to be in [0, 1] for color distortion.
distorted_image /= 255.
# Randomly distort the colors.
distorted_image = distort_color(distorted_image, batch_position,
distort_color_in_yiq=distort_color_in_yiq)
# Note: This ensures the scaling matches the output of eval_image
distorted_image *= 255
if summary_verbosity >= 3:
tf.summary.image(
'final_distorted_image',
tf.expand_dims(distorted_image, 0))
return distorted_image
def distort_color(image, batch_position=0, distort_color_in_yiq=False,
scope=None):
"""Distort the color of the image.
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
  Rather than adding that level of complication, we select a distinct ordering
of color ops based on the position of the image in a batch.
Args:
image: float32 Tensor containing single image. Tensor values should be in
range [0, 1].
batch_position: the position of the image in a batch. NOTE: this argument
can be an integer or a tensor
distort_color_in_yiq: distort color of input images in YIQ space.
scope: Optional scope for op_scope.
Returns:
color-distorted image
"""
if distort_color_in_yiq:
try:
from tensorflow.contrib.image.python.ops import distort_image_ops # pylint: disable=g-import-not-at-top
except ImportError:
raise ValueError(
'In TF2, you cannot pass --distortions unless you also pass '
'--nodistort_color_in_yiq. This is because the random_hsv_in_yiq was '
'removed in TF2. --distortions does not improve accuracy on resnet '
'so it is not recommended. --nodistort_color_in_yiq also has no '
'impact on accuracy, but may hurt performance.')
with tf.name_scope(scope or 'distort_color'):
def distort_fn_0(image=image):
"""Variant 0 of distort function."""
image = tf.image.random_brightness(image, max_delta=32. / 255.)
if distort_color_in_yiq:
image = distort_image_ops.random_hsv_in_yiq(
image, lower_saturation=0.5, upper_saturation=1.5,
max_delta_hue=0.2 * math.pi)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
return image
def distort_fn_1(image=image):
"""Variant 1 of distort function."""
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
if distort_color_in_yiq:
image = distort_image_ops.random_hsv_in_yiq(
image, lower_saturation=0.5, upper_saturation=1.5,
max_delta_hue=0.2 * math.pi)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
return image
image = utils.smart_cond(batch_position % 2 == 0, distort_fn_0,
distort_fn_1)
# The random_* ops do not necessarily clamp.
image = tf.clip_by_value(image, 0.0, 1.0)
return image
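# Usage sketch (illustration only, not part of the upstream file):
# distort_color expects float32 images in [0, 1]. Even batch positions take
# distort_fn_0's op ordering and odd positions take distort_fn_1's, as the
# docstring describes. The input here is a random image, for illustration.
def _example_distort_color():
  image = tf.random_uniform([224, 224, 3], minval=0., maxval=1.)
  even = distort_color(image, batch_position=0)
  odd = distort_color(image, batch_position=1)
  return even, odd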
class InputPreprocessor(object):
"""Base class for all model preprocessors."""
def __init__(self, batch_size, output_shapes):
self.batch_size = batch_size
self.output_shapes = output_shapes
def supports_datasets(self):
"""Whether this preprocessor supports dataset."""
return False
def minibatch(self, dataset, subset, params, shift_ratio=-1):
"""Returns tensors representing a minibatch of all the input."""
raise NotImplementedError('Must be implemented by subclass.')
# The methods added below are only supported/used if supports_datasets()
# returns True.
# TODO(laigd): refactor benchmark_cnn.py and put the logic of
# _build_input_processing() into InputPreprocessor.
def parse_and_preprocess(self, value, batch_position):
"""Function to parse and preprocess an Example proto in input pipeline."""
raise NotImplementedError('Must be implemented by subclass.')
# TODO(laigd): figure out how to remove these parameters, since the
# preprocessor itself has self.batch_size, self.num_splits, etc defined.
def build_multi_device_iterator(self, batch_size, num_splits, cpu_device,
params, gpu_devices, dataset, doing_eval):
"""Creates a MultiDeviceIterator."""
assert self.supports_datasets()
assert num_splits == len(gpu_devices)
with tf.name_scope('batch_processing'):
if doing_eval:
subset = 'validation'
else:
subset = 'train'
batch_size_per_split = batch_size // num_splits
ds = self.create_dataset(
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train=(not doing_eval),
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
ds,
gpu_devices,
source_device=cpu_device,
max_buffer_size=params.multi_device_iterator_max_buffer_size)
tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
multi_device_iterator.initializer)
return multi_device_iterator
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
raise NotImplementedError('Must be implemented by subclass.')
def create_iterator(self, ds):
ds_iterator = tf.data.make_initializable_iterator(ds)
tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
ds_iterator.initializer)
return ds_iterator
def minibatch_fn(self, batch_size, model_input_shapes, num_splits,
dataset, subset, train, datasets_repeat_cached_sample,
num_threads, datasets_use_caching,
datasets_parallel_interleave_cycle_length,
datasets_sloppy_parallel_interleave,
datasets_parallel_interleave_prefetch):
"""Returns a function and list of args for the fn to create a minibatch."""
assert self.supports_datasets()
batch_size_per_split = batch_size // num_splits
assert batch_size_per_split == model_input_shapes[0][0]
with tf.name_scope('batch_processing'):
ds = self.create_dataset(batch_size, num_splits, batch_size_per_split,
dataset, subset, train,
datasets_repeat_cached_sample, num_threads,
datasets_use_caching,
datasets_parallel_interleave_cycle_length,
datasets_sloppy_parallel_interleave,
datasets_parallel_interleave_prefetch)
ds_iterator = self.create_iterator(ds)
ds_iterator_string_handle = ds_iterator.string_handle()
@function.Defun(tf.string)
def _fn(h):
remote_iterator = tf.data.Iterator.from_string_handle(
h, ds_iterator.output_types, ds_iterator.output_shapes)
input_list = remote_iterator.get_next()
reshaped_input_list = [
tf.reshape(input_list[i], shape=model_input_shapes[i])
for i in range(len(input_list))
]
return reshaped_input_list
return _fn, [ds_iterator_string_handle]
class BaseImagePreprocessor(InputPreprocessor):
"""Base class for all image model preprocessors."""
def __init__(self,
batch_size,
output_shapes,
num_splits,
dtype,
train,
distortions,
resize_method,
shift_ratio=-1,
summary_verbosity=0,
distort_color_in_yiq=True,
fuse_decode_and_crop=True,
match_mlperf=False):
super(BaseImagePreprocessor, self).__init__(batch_size, output_shapes)
image_shape = output_shapes[0]
# image_shape is in form (batch_size, height, width, depth)
self.height = image_shape[1]
self.width = image_shape[2]
self.depth = image_shape[3]
self.num_splits = num_splits
self.dtype = dtype
self.train = train
self.resize_method = resize_method
self.shift_ratio = shift_ratio
self.distortions = distortions
self.distort_color_in_yiq = distort_color_in_yiq
self.fuse_decode_and_crop = fuse_decode_and_crop
if self.batch_size % self.num_splits != 0:
raise ValueError(
('batch_size must be a multiple of num_splits: '
'batch_size %d, num_splits: %d') %
(self.batch_size, self.num_splits))
self.batch_size_per_split = self.batch_size // self.num_splits
self.summary_verbosity = summary_verbosity
self.match_mlperf = match_mlperf
def parse_and_preprocess(self, value, batch_position):
assert self.supports_datasets()
image_buffer, label_index, bbox, _ = parse_example_proto(value)
if self.match_mlperf:
bbox = tf.zeros((1, 0, 4), dtype=bbox.dtype)
mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=False)
else:
mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=True)
image = self.preprocess(image_buffer, bbox, batch_position)
return (image, label_index)
def preprocess(self, image_buffer, bbox, batch_position):
raise NotImplementedError('Must be implemented by subclass.')
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
file_names = gfile.Glob(glob_pattern)
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave,
prefetch_input_elements=datasets_parallel_interleave_prefetch))
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
counter = tf.data.Dataset.range(batch_size)
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
buffer_size = 10000
mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=buffer_size)
ds = ds.apply(
tf.data.experimental.shuffle_and_repeat(buffer_size=buffer_size))
else:
ds = ds.repeat()
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=self.parse_and_preprocess,
batch_size=batch_size_per_split,
num_parallel_batches=num_splits))
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = num_threads
ds = ds.with_options(options)
return ds
class RecordInputImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor for images with RecordInput format."""
def preprocess(self, image_buffer, bbox, batch_position):
"""Preprocessing image_buffer as a function of its batch position."""
if self.train:
image = train_image(image_buffer, self.height, self.width, bbox,
batch_position, self.resize_method, self.distortions,
None, summary_verbosity=self.summary_verbosity,
distort_color_in_yiq=self.distort_color_in_yiq,
fuse_decode_and_crop=self.fuse_decode_and_crop)
else:
image = tf.image.decode_jpeg(
image_buffer, channels=3, dct_method='INTEGER_FAST')
image = eval_image(image, self.height, self.width, batch_position,
self.resize_method,
summary_verbosity=self.summary_verbosity)
# Note: image is now float32 [height,width,3] with range [0, 255]
# image = tf.cast(image, tf.uint8) # HACK TESTING
if self.match_mlperf:
mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION,
value=_CHANNEL_MEANS)
normalized = image - _CHANNEL_MEANS
else:
normalized = normalized_image(image)
return tf.cast(normalized, self.dtype)
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
if shift_ratio < 0:
shift_ratio = self.shift_ratio
with tf.name_scope('batch_processing'):
# Build final results per split.
images = [[] for _ in range(self.num_splits)]
labels = [[] for _ in range(self.num_splits)]
if params.use_datasets:
ds = self.create_dataset(
self.batch_size, self.num_splits, self.batch_size_per_split,
dataset, subset, self.train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
for d in xrange(self.num_splits):
images[d], labels[d] = ds_iterator.get_next()
# TODO(laigd): consider removing the --use_datasets option, it should
# always use datasets.
else:
record_input = data_flow_ops.RecordInput(
file_pattern=dataset.tf_record_pattern(subset),
seed=301,
parallelism=64,
buffer_size=10000,
batch_size=self.batch_size,
shift_ratio=shift_ratio,
name='record_input')
records = record_input.get_yield_op()
records = tf.split(records, self.batch_size, 0)
records = [tf.reshape(record, []) for record in records]
for idx in xrange(self.batch_size):
value = records[idx]
(image, label) = self.parse_and_preprocess(value, idx)
split_index = idx % self.num_splits
labels[split_index].append(label)
images[split_index].append(image)
for split_index in xrange(self.num_splits):
if not params.use_datasets:
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.concat(labels[split_index], 0)
images[split_index] = tf.reshape(
images[split_index],
shape=[self.batch_size_per_split, self.height, self.width,
self.depth])
labels[split_index] = tf.reshape(labels[split_index],
[self.batch_size_per_split])
return images, labels
def supports_datasets(self):
return True
class ImagenetPreprocessor(RecordInputImagePreprocessor):
def preprocess(self, image_buffer, bbox, batch_position):
# pylint: disable=g-import-not-at-top
try:
from official.r1.resnet.imagenet_preprocessing import preprocess_image
except ImportError:
tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.')
raise
if self.train:
image = preprocess_image(
image_buffer, bbox, self.height, self.width, self.depth,
is_training=True)
else:
image = preprocess_image(
image_buffer, bbox, self.height, self.width, self.depth,
is_training=False)
return tf.cast(image, self.dtype)
class Cifar10ImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor for Cifar10 input images."""
def _distort_image(self, image):
"""Distort one image for training a network.
    We adopt the standard data augmentation scheme that is widely used for
    this dataset: images are first zero-padded with 4 pixels on each side,
    then randomly cropped back to their original size; half of the images
    are then horizontally mirrored.
Args:
image: input image.
Returns:
distorted image.
"""
image = tf.image.resize_image_with_crop_or_pad(
image, self.height + 8, self.width + 8)
distorted_image = tf.random_crop(image,
[self.height, self.width, self.depth])
# Randomly flip the image horizontally.
distorted_image = tf.image.random_flip_left_right(distorted_image)
if self.summary_verbosity >= 3:
tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0))
return distorted_image
def _eval_image(self, image):
"""Get the image for model evaluation."""
distorted_image = tf.image.resize_image_with_crop_or_pad(
image, self.width, self.height)
if self.summary_verbosity >= 3:
tf.summary.image('cropped.image', tf.expand_dims(distorted_image, 0))
return distorted_image
def preprocess(self, raw_image):
"""Preprocessing raw image."""
if self.summary_verbosity >= 3:
tf.summary.image('raw.image', tf.expand_dims(raw_image, 0))
if self.train and self.distortions:
image = self._distort_image(raw_image)
else:
image = self._eval_image(raw_image)
normalized = normalized_image(image)
return tf.cast(normalized, self.dtype)
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
# TODO(jsimsa): Implement datasets code path
del shift_ratio, params
with tf.name_scope('batch_processing'):
all_images, all_labels = dataset.read_data_files(subset)
all_images = tf.constant(all_images)
all_labels = tf.constant(all_labels)
input_image, input_label = tf.train.slice_input_producer(
[all_images, all_labels])
input_image = tf.cast(input_image, self.dtype)
input_label = tf.cast(input_label, tf.int32)
# Ensure that the random shuffling has good mixing properties.
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(dataset.num_examples_per_epoch(subset) *
min_fraction_of_examples_in_queue)
raw_images, raw_labels = tf.train.shuffle_batch(
[input_image, input_label], batch_size=self.batch_size,
capacity=min_queue_examples + 3 * self.batch_size,
min_after_dequeue=min_queue_examples)
images = [[] for i in range(self.num_splits)]
labels = [[] for i in range(self.num_splits)]
# Create a list of size batch_size, each containing one image of the
# batch. Without the unstack call, raw_images[i] would still access the
# same image via a strided_slice op, but would be slower.
raw_images = tf.unstack(raw_images, axis=0)
raw_labels = tf.unstack(raw_labels, axis=0)
for i in xrange(self.batch_size):
split_index = i % self.num_splits
# The raw image read from data has the format [depth, height, width]
# reshape to the format returned by minibatch.
raw_image = tf.reshape(raw_images[i],
[dataset.depth, dataset.height, dataset.width])
raw_image = tf.transpose(raw_image, [1, 2, 0])
image = self.preprocess(raw_image)
images[split_index].append(image)
labels[split_index].append(raw_labels[i])
for split_index in xrange(self.num_splits):
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.parallel_stack(labels[split_index])
return images, labels
class COCOPreprocessor(BaseImagePreprocessor):
"""Preprocessor for COCO dataset input images, boxes, and labels."""
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
del shift_ratio # Not used when using datasets instead of data_flow_ops
with tf.name_scope('batch_processing'):
ds = self.create_dataset(
batch_size=self.batch_size,
num_splits=self.num_splits,
batch_size_per_split=self.batch_size_per_split,
dataset=dataset,
subset=subset,
train=self.train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
# Training data: 4 tuple
# Validation data: 5 tuple
# See get_input_shapes in models/ssd_model.py for details.
input_len = 4 if subset == 'train' else 5
input_lists = [[None for _ in range(self.num_splits)]
for _ in range(input_len)]
for d in xrange(self.num_splits):
input_list = ds_iterator.get_next()
for i in range(input_len):
input_lists[i][d] = input_list[i]
return input_lists
def preprocess(self, data):
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
import ssd_constants # pylint: disable=g-import-not-at-top
from object_detection.core import preprocessor # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation')
image_buffer = data['image_buffer']
boxes = data['groundtruth_boxes']
classes = tf.reshape(data['groundtruth_classes'], [-1, 1])
source_id = tf.string_to_number(data['source_id'])
raw_shape = data['raw_shape']
ssd_encoder = ssd_dataloader.Encoder()
# Only 80 of the 90 COCO classes are used.
class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
classes = tf.gather(class_map, classes)
classes = tf.cast(classes, dtype=tf.float32)
if self.train:
image, boxes, classes = ssd_dataloader.ssd_decode_and_crop(
image_buffer, boxes, classes, raw_shape)
      # ssd_decode_and_crop resizes and returns a float32 image without
      # changing its range (i.e., values are still in [0, 255]). Dividing by
      # 255 converts it to the [0, 1] range. We do not do this before cropping,
      # to avoid a dtype cast (which would incur an additional memory copy).
image /= 255.
image, boxes = preprocessor.random_horizontal_flip(
image=image, boxes=boxes)
# Random horizontal flip probability is 50%
# See https://github.com/tensorflow/models/blob/master/research/object_detection/core/preprocessor.py # pylint: disable=line-too-long
mlperf.logger.log(key=mlperf.tags.RANDOM_FLIP_PROBABILITY, value=0.5)
image = tf.cast(image, self.dtype)
encoded_returns = ssd_encoder.encode_labels(boxes, classes)
encoded_classes, encoded_boxes, num_matched_boxes = encoded_returns
# Shape of image: [width, height, channel]
# Shape of encoded_boxes: [NUM_SSD_BOXES, 4]
# Shape of encoded_classes: [NUM_SSD_BOXES, 1]
# Shape of num_matched_boxes: [1]
return (image, encoded_boxes, encoded_classes, num_matched_boxes)
else:
image = tf.image.decode_jpeg(image_buffer)
image = tf.image.resize_images(
image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
# resize_image returns image of dtype float32 and does not change its
# range. Divide by 255 to convert image to [0, 1] range.
image /= 255.
image = ssd_dataloader.normalize_image(image)
image = tf.cast(image, self.dtype)
def trim_and_pad(inp_tensor):
"""Limit the number of boxes, and pad if necessary."""
inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
return tf.reshape(inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES,
inp_tensor.get_shape()[1]])
boxes, classes = trim_and_pad(boxes), trim_and_pad(classes)
# Shape of boxes: [MAX_NUM_EVAL_BOXES, 4]
# Shape of classes: [MAX_NUM_EVAL_BOXES, 1]
# Shape of source_id: [] (scalar tensor)
# Shape of raw_shape: [3]
return (image, boxes, classes, source_id, raw_shape)
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation')
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
ds = tf.data.TFRecordDataset.list_files(glob_pattern, shuffle=train)
# TODO(haoyuzhang): Enable map+filter fusion after cl/218399112 in release
# options = tf.data.Options()
# options.experimental_optimization = tf.data.experimental.OptimizationOptions() # pylint: disable=line-too-long
# options.experimental_optimization.map_and_filter_fusion = True
# ds = ds.with_options(options)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave))
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=10000)
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
else:
ds = ds.repeat()
ds = ds.map(ssd_dataloader.ssd_parse_example_proto, num_parallel_calls=64)
ds = ds.filter(
lambda data: tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0))
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=self.preprocess,
batch_size=batch_size_per_split,
num_parallel_batches=num_splits,
drop_remainder=train))
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = num_threads
ds = ds.with_options(options)
return ds
def supports_datasets(self):
return True
class TestImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor used for testing.
set_fake_data() sets which images and labels will be output by minibatch(),
and must be called before minibatch(). This allows tests to easily specify
a set of images to use for training, without having to create any files.
Queue runners must be started for this preprocessor to work.
"""
def __init__(self,
batch_size,
output_shapes,
num_splits,
dtype,
train=None,
distortions=None,
resize_method=None,
shift_ratio=0,
summary_verbosity=0,
distort_color_in_yiq=False,
fuse_decode_and_crop=False,
match_mlperf=False):
super(TestImagePreprocessor, self).__init__(
batch_size, output_shapes, num_splits, dtype, train, distortions,
resize_method, shift_ratio, summary_verbosity=summary_verbosity,
distort_color_in_yiq=distort_color_in_yiq,
fuse_decode_and_crop=fuse_decode_and_crop, match_mlperf=match_mlperf)
self.expected_subset = None
def set_fake_data(self, fake_images, fake_labels):
assert len(fake_images.shape) == 4
assert len(fake_labels.shape) == 1
num_images = fake_images.shape[0]
assert num_images == fake_labels.shape[0]
assert num_images % self.batch_size == 0
self.fake_images = fake_images
self.fake_labels = fake_labels
def minibatch(self,
dataset,
subset,
params,
shift_ratio=0):
"""Get test image batches."""
del dataset, params
if (not hasattr(self, 'fake_images') or
not hasattr(self, 'fake_labels')):
raise ValueError('Must call set_fake_data() before calling minibatch '
'on TestImagePreprocessor')
if self.expected_subset is not None:
assert subset == self.expected_subset
shift_ratio = shift_ratio or self.shift_ratio
fake_images = cnn_util.roll_numpy_batches(self.fake_images, self.batch_size,
shift_ratio)
fake_labels = cnn_util.roll_numpy_batches(self.fake_labels, self.batch_size,
shift_ratio)
with tf.name_scope('batch_processing'):
image_slice, label_slice = tf.train.slice_input_producer(
[fake_images, fake_labels],
shuffle=False,
name='image_slice')
raw_images, raw_labels = tf.train.batch(
[image_slice, label_slice], batch_size=self.batch_size,
name='image_batch')
images = [[] for _ in range(self.num_splits)]
labels = [[] for _ in range(self.num_splits)]
for i in xrange(self.batch_size):
split_index = i % self.num_splits
raw_image = tf.cast(raw_images[i], self.dtype)
images[split_index].append(raw_image)
labels[split_index].append(raw_labels[i])
for split_index in xrange(self.num_splits):
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.parallel_stack(labels[split_index])
normalized = [normalized_image(part) for part in images]
return [[tf.cast(part, self.dtype) for part in normalized], labels]
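# Usage sketch (illustration only, not part of the upstream file): exercises
# TestImagePreprocessor as its docstring describes. The (16, 8, 8, 3) image
# array, label values, and num_splits are assumptions chosen for the sketch;
# queue runners must be started before evaluating the returned tensors.
def _example_test_image_preprocessor():
  import numpy as np  # Local import to keep the sketch self-contained.
  fake_images = np.zeros((16, 8, 8, 3), dtype=np.float32)
  fake_labels = np.arange(16, dtype=np.int32)
  preprocessor = TestImagePreprocessor(
      batch_size=16, output_shapes=[(16, 8, 8, 3), (16,)],
      num_splits=2, dtype=tf.float32)
  preprocessor.set_fake_data(fake_images, fake_labels)
  return preprocessor.minibatch(dataset=None, subset='train', params=None)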
class LibrispeechPreprocessor(InputPreprocessor):
"""Preprocessor for librispeech class for all image model preprocessors."""
def __init__(self, batch_size, output_shapes, num_splits, dtype, train,
**kwargs):
del kwargs
super(LibrispeechPreprocessor, self).__init__(batch_size, output_shapes)
self.num_splits = num_splits
self.dtype = dtype
self.is_train = train
if self.batch_size % self.num_splits != 0:
raise ValueError(('batch_size must be a multiple of num_splits: '
'batch_size %d, num_splits: %d') % (self.batch_size,
self.num_splits))
self.batch_size_per_split = self.batch_size // self.num_splits
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
# TODO(laigd): currently the only difference between this and the one in
# BaseImagePreprocessor is, this uses map() and padded_batch() while the
# latter uses tf.data.experimental.map_and_batch(). Try to merge them.
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
file_names = gfile.Glob(glob_pattern)
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave,
prefetch_input_elements=datasets_parallel_interleave_prefetch))
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
counter = tf.data.Dataset.range(batch_size)
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
else:
ds = ds.repeat()
ds = ds.map(map_func=self.parse_and_preprocess,
num_parallel_calls=batch_size_per_split*num_splits)
ds = ds.padded_batch(
batch_size=batch_size_per_split,
padded_shapes=tuple([
tf.TensorShape(output_shape[1:])
for output_shape in self.output_shapes
]),
drop_remainder=True)
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = num_threads
ds = ds.with_options(options)
return ds
def minibatch(self, dataset, subset, params, shift_ratio=-1):
assert params.use_datasets
# TODO(laigd): unify this with CNNModel's minibatch()
# TODO(laigd): in distributed mode we use shift_ratio so different workers
# won't work on same inputs, so we should respect that.
del shift_ratio
with tf.name_scope('batch_processing'):
ds = self.create_dataset(
self.batch_size,
self.num_splits,
self.batch_size_per_split,
dataset,
subset,
self.is_train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
# The four lists are: input spectrogram feature, labels, input lengths,
# label lengths
input_lists = [[None for _ in range(self.num_splits)] for _ in range(4)]
for d in xrange(self.num_splits):
input_list = ds_iterator.get_next()
for i in range(4):
input_lists[i][d] = input_list[i]
assert self.output_shapes == [
input_lists[i][0].shape.as_list() for i in range(4)
]
return tuple(input_lists)
def supports_datasets(self):
return True
def parse_and_preprocess(self, value, batch_position):
"""Parse an TFRecord."""
del batch_position
assert self.supports_datasets()
context_features = {
'labels': tf.VarLenFeature(dtype=tf.int64),
'input_length': tf.FixedLenFeature([], dtype=tf.int64),
'label_length': tf.FixedLenFeature([], dtype=tf.int64),
}
sequence_features = {
'features': tf.FixedLenSequenceFeature([161], dtype=tf.float32)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=value,
context_features=context_features,
sequence_features=sequence_features,
)
return [
# Input
tf.expand_dims(sequence_parsed['features'], axis=2),
# Label
tf.cast(
tf.reshape(
tf.sparse_tensor_to_dense(context_parsed['labels']), [-1]),
dtype=tf.int32),
# Input length
tf.cast(
tf.reshape(context_parsed['input_length'], [1]),
dtype=tf.int32),
# Label length
tf.cast(
tf.reshape(context_parsed['label_length'], [1]),
dtype=tf.int32),
]
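# Usage sketch (illustration only, not part of the upstream file): builds a
# SequenceExample matching the context/sequence features parsed above. The
# label ids, lengths, and zero-valued 161-bin spectrogram frames are made up.
def _example_librispeech_sequence_example():
  context = tf.train.Features(feature={
      'labels': tf.train.Feature(
          int64_list=tf.train.Int64List(value=[7, 2, 9])),
      'input_length': tf.train.Feature(
          int64_list=tf.train.Int64List(value=[5])),
      'label_length': tf.train.Feature(
          int64_list=tf.train.Int64List(value=[3])),
  })
  feature_lists = tf.train.FeatureLists(feature_list={
      'features': tf.train.FeatureList(feature=[
          tf.train.Feature(float_list=tf.train.FloatList(value=[0.0] * 161))
          for _ in range(5)
      ])
  })
  return tf.train.SequenceExample(
      context=context, feature_lists=feature_lists).SerializeToString()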
#!/bin/bash
source /public/home/qianyj/virtualenv/dtk21.10.1/dtk21.10.1_tf1.15/venv/bin/activate
export ROCM_PATH=/public/home/qianyj/package/dtk-21.10.1/dtk-21.10.1
export HIP_PATH=${ROCM_PATH}/hip
export CPACK_INSTALL_PREFIX=$ROCM_PATH
export AMDGPU_TARGETS="gfx900;gfx906"
export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hip/bin:$PATH
export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${ROCM_PATH}/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${ROCM_PATH}/hip/lib:${ROCM_PATH}/llvm/lib:$LD_LIBRARY_PATH
export C_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/llvm/include${C_INCLUDE_PATH:+:${C_INCLUDE_PATH}}
export CPLUS_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/llvm/include${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export TF_CPP_MIN_VLOG_LEVEL=2
HIP_VISIBLE_DEVICES=0,1,2,3 numactl --cpunodebind=0,1,2,3 --membind=0,1,2,3 \
  nohup python3 tf_cnn_benchmarks.py \
    --data_format=NCHW \
    --batch_size=128 \
    --model=resnet50 \
    --save_model_steps=20000 \
    --optimizer=momentum \
    --variable_update=replicated \
    --print_training_accuracy=true \
    --eval_during_training_every_n_epochs=1 \
    --nodistortions \
    --num_gpus=4 \
    --num_epochs=90 \
    --weight_decay=1e-4 \
    --data_dir=/public/software/apps/DeepLearning/Data/ImageNet-tensorflow/ \
    --use_fp16=False \
    --data_name=imagenet \
    --train_dir=/public/home/qianyj/TF_test/dtk21.10.1/tf1.15/benchmarks-master/scripts/checkpoint \
    >logfile 2>&1 &
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs the tf_cnn_benchmarks tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unittest
from absl import app
from absl import flags as absl_flags
import tensorflow.compat.v1 as tf
import all_reduce_benchmark_test
import allreduce_test
import benchmark_cnn_distributed_test
import benchmark_cnn_test
import cnn_util_test
import variable_mgr_util_test
from models import model_config
# Ideally, we wouldn't need this option, and run both distributed tests and non-
# distributed tests. But, TensorFlow allocates all the GPU memory by default, so
# the non-distributed tests allocate all the GPU memory. The distributed tests
# spawn processes that run TensorFlow, and cannot run if all the GPU memory is
# already allocated. If a non-distributed test is run, then a distributed test
# is run in the same process, the distributed test will fail because there is no
# more GPU memory for the spawned processes to allocate.
absl_flags.DEFINE_boolean('run_distributed_tests', False,
'If True, run the distributed tests. If False, the'
'non-distributed tests.')
absl_flags.DEFINE_boolean('full_tests', False,
'If True, all distributed or non-distributed tests '
'are run, which can take hours. If False, only a '
'subset of tests will be run. This subset runs much '
                          'faster and tests almost all of the functionality of '
                          'the full set of tests, so it is recommended to keep '
'this option set to False.')
FLAGS = absl_flags.FLAGS
def main(_):
loader = unittest.defaultTestLoader
if FLAGS.full_tests:
suite = unittest.TestSuite([
loader.loadTestsFromModule(allreduce_test),
loader.loadTestsFromModule(cnn_util_test),
loader.loadTestsFromModule(variable_mgr_util_test),
loader.loadTestsFromModule(benchmark_cnn_test),
loader.loadTestsFromModule(all_reduce_benchmark_test),
])
if model_config.can_import_contrib:
from models.tf1_only import nasnet_test # pylint: disable=g-import-not-at-top
suite.addTest(loader.loadTestsFromModule(nasnet_test))
dist_suite = unittest.TestSuite([
loader.loadTestsFromModule(benchmark_cnn_distributed_test),
])
else:
suite = unittest.TestSuite([
loader.loadTestsFromModule(allreduce_test),
loader.loadTestsFromModule(cnn_util_test),
loader.loadTestsFromModule(all_reduce_benchmark_test),
loader.loadTestsFromModule(variable_mgr_util_test),
loader.loadTestsFromTestCase(benchmark_cnn_test.TestAlexnetModel),
loader.loadTestsFromTestCase(benchmark_cnn_test.TfCnnBenchmarksTest),
loader.loadTestsFromTestCase(benchmark_cnn_test.VariableUpdateTest),
loader.loadTestsFromTestCase(
benchmark_cnn_test.VariableMgrLocalReplicatedTest),
])
dist_suite = unittest.TestSuite([
loader.loadTestsFromNames([
'benchmark_cnn_distributed_test.DistributedVariableUpdateTest'
'.testVarUpdateDefault',
'benchmark_cnn_distributed_test.TfCnnBenchmarksDistributedTest'
'.testParameterServer',
]),
])
if FLAGS.run_distributed_tests:
print('Running distributed tests')
result = unittest.TextTestRunner(verbosity=2).run(dist_suite)
else:
print('Running non-distributed tests')
result = unittest.TextTestRunner(verbosity=2).run(suite)
sys.exit(not result.wasSuccessful())
if __name__ == '__main__':
tf.disable_v2_behavior()
app.run(main)
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Central location for all constants related to MLPerf SSD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# ==============================================================================
# == Model =====================================================================
# ==============================================================================
IMAGE_SIZE = 300
# TODO(taylorrobie): MLPerf uses 80, but COCO documents 90. (RetinaNet uses 90)
# Update(taylorrobie): Labels > 81 show up in the pipeline. This will need to
# be resolved.
NUM_CLASSES = 81 # Including "no class". Not all COCO classes are used.
# Note: Zero is special. (Background class) CLASS_INV_MAP[0] must be zero.
CLASS_INV_MAP = (
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87,
88, 89, 90)
_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)}
CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1))
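# Illustrative check (not part of the original file): COCO label ids are not
# contiguous, so CLASS_MAP compacts them and marks unused ids with -1. For
# example, CLASS_INV_MAP[12] == 13, hence:
#   assert CLASS_MAP[13] == 12  # COCO id 13 -> contiguous index 12
#   assert CLASS_MAP[12] == -1  # COCO id 12 is unused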
NUM_SSD_BOXES = 8732
RESNET_DEPTH = 34
"""SSD specific"""
MIN_LEVEL = 3
MAX_LEVEL = 8
FEATURE_SIZES = (38, 19, 10, 5, 3, 1)
STEPS = (8, 16, 32, 64, 100, 300)
# https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
SCALES = (21, 45, 99, 153, 207, 261, 315)
ASPECT_RATIOS = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,))
NUM_DEFAULTS = (4, 6, 6, 6, 4, 4)
NUM_DEFAULTS_BY_LEVEL = {3: 4, 4: 6, 5: 6, 6: 6, 7: 4, 8: 4}
SCALE_XY = 0.1
SCALE_HW = 0.2
BOX_CODER_SCALES = (1 / SCALE_XY, 1 / SCALE_XY, 1 / SCALE_HW, 1 / SCALE_HW)
MATCH_THRESHOLD = 0.5
# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
NORMALIZATION_MEAN = (0.485, 0.456, 0.406)
NORMALIZATION_STD = (0.229, 0.224, 0.225)
# SSD Cropping
NUM_CROP_PASSES = 50
CROP_MIN_IOU_CHOICES = (0, 0.1, 0.3, 0.5, 0.7, 0.9)
P_NO_CROP_PER_PASS = 1 / (len(CROP_MIN_IOU_CHOICES) + 1)
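# Worked example: with the six IoU choices above, each cropping pass skips
# cropping with probability 1 / (6 + 1) = 1/7 ~= 0.143.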
# Hard example mining
NEGS_PER_POSITIVE = 3
# Batch normalization
BATCH_NORM_DECAY = 0.997
BATCH_NORM_EPSILON = 1e-4
# ==============================================================================
# == Optimizer =================================================================
# ==============================================================================
LEARNING_RATE_SCHEDULE = (
(0, 1e-3),
(160000, 1e-4),
(200000, 1e-5),
)
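# Illustrative reading of the schedule above: the learning rate is 1e-3 from
# step 0, drops to 1e-4 at step 160000, and to 1e-5 at step 200000.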
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4
# ==============================================================================
# == Keys ======================================================================
# ==============================================================================
BOXES = "boxes"
CLASSES = "classes"
NUM_MATCHED_BOXES = "num_matched_boxes"
IMAGE = "image"
SOURCE_ID = "source_id"
RAW_SHAPE = "raw_shape"
PRED_BOXES = "pred_boxes"
PRED_SCORES = "pred_scores"
# ==============================================================================
# == Evaluation ================================================================
# ==============================================================================
# Note: This is based on a batch size of 32
# https://github.com/mlperf/reference/blob/master/single_stage_detector/ssd/train.py#L21-L37
CHECKPOINT_FREQUENCY = 20000
MAX_NUM_EVAL_BOXES = 200
OVERLAP_CRITERIA = 0.5  # Used for non-max suppression
MIN_SCORE = 0.05 # Minimum score to be considered during evaluation.
DUMMY_SCORE = -1e5 # If no boxes are matched.
ANNOTATION_FILE = "annotations/instances_val2017.json"
COCO_NUM_TRAIN_IMAGES = 118287
COCO_NUM_VAL_IMAGES = 4952
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader and processing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools as it
import math
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.box_coders import faster_rcnn_box_coder
from object_detection.core import box_list
from object_detection.core import region_similarity_calculator
from object_detection.core import target_assigner
from object_detection.matchers import argmax_matcher
import mlperf
import ssd_constants
class DefaultBoxes(object):
"""Default bounding boxes for 300x300 5 layer SSD.
Default bounding boxes generation follows the order of (W, H, anchor_sizes).
Therefore, the tensor converted from DefaultBoxes has a shape of
[anchor_sizes, H, W, 4]. The last dimension is the box coordinates; 'ltrb'
is [ymin, xmin, ymax, xmax] while 'xywh' is [cy, cx, h, w].
"""
def __init__(self):
fk = ssd_constants.IMAGE_SIZE / np.array(ssd_constants.STEPS)
self.default_boxes = []
    # Generate default boxes for each feature map resolution.
for idx, feature_size in enumerate(ssd_constants.FEATURE_SIZES):
sk1 = ssd_constants.SCALES[idx] / ssd_constants.IMAGE_SIZE
sk2 = ssd_constants.SCALES[idx+1] / ssd_constants.IMAGE_SIZE
sk3 = math.sqrt(sk1*sk2)
all_sizes = [(sk1, sk1), (sk3, sk3)]
for alpha in ssd_constants.ASPECT_RATIOS[idx]:
w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
all_sizes.append((w, h))
all_sizes.append((h, w))
assert len(all_sizes) == ssd_constants.NUM_DEFAULTS[idx]
for w, h in all_sizes:
for i, j in it.product(range(feature_size), repeat=2):
cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
box = tuple(np.clip(k, 0, 1) for k in (cy, cx, h, w))
self.default_boxes.append(box)
assert len(self.default_boxes) == ssd_constants.NUM_SSD_BOXES
mlperf.logger.log(key=mlperf.tags.FEATURE_SIZES,
value=ssd_constants.FEATURE_SIZES)
mlperf.logger.log(key=mlperf.tags.STEPS,
value=ssd_constants.STEPS)
mlperf.logger.log(key=mlperf.tags.SCALES,
value=ssd_constants.SCALES)
mlperf.logger.log(key=mlperf.tags.ASPECT_RATIOS,
value=ssd_constants.ASPECT_RATIOS)
mlperf.logger.log(key=mlperf.tags.NUM_DEFAULTS,
value=ssd_constants.NUM_SSD_BOXES)
def to_ltrb(cy, cx, h, w):
return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2
# For IoU calculation
self.default_boxes_ltrb = tuple(to_ltrb(*i) for i in self.default_boxes)
  def __call__(self, order='ltrb'):
    if order == 'ltrb': return self.default_boxes_ltrb
    if order == 'xywh': return self.default_boxes
    raise ValueError('Unsupported box order: {}'.format(order))
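# Minimal usage sketch (illustrative, not part of the original file):
#
#   default_boxes = DefaultBoxes()
#   ltrb = default_boxes('ltrb')  # 8732 tuples of (ymin, xmin, ymax, xmax)
#   xywh = default_boxes('xywh')  # 8732 tuples of (cy, cx, h, w)
#   assert len(ltrb) == ssd_constants.NUM_SSD_BOXES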
def calc_iou_tensor(boxes1, boxes2):
"""Calculation of IoU based on two boxes tensor.
Reference to https://github.com/kuangliu/pytorch-ssd
Args:
boxes1: shape (N, 4), four coordinates of N boxes
boxes2: shape (M, 4), four coordinates of M boxes
Returns:
IoU: shape (N, M), IoU of the i-th box in `boxes1` and j-th box in `boxes2`
"""
b1_left, b1_top, b1_right, b1_bottom = tf.split(boxes1, 4, axis=1)
b2_left, b2_top, b2_right, b2_bottom = tf.split(boxes2, 4, axis=1)
# Shape of intersect_* (N, M)
intersect_left = tf.maximum(b1_left, tf.transpose(b2_left))
intersect_top = tf.maximum(b1_top, tf.transpose(b2_top))
intersect_right = tf.minimum(b1_right, tf.transpose(b2_right))
intersect_bottom = tf.minimum(b1_bottom, tf.transpose(b2_bottom))
boxes1_area = (b1_right - b1_left) * (b1_bottom - b1_top)
boxes2_area = (b2_right - b2_left) * (b2_bottom - b2_top)
intersect = tf.multiply(tf.maximum((intersect_right - intersect_left), 0),
tf.maximum((intersect_bottom - intersect_top), 0))
union = boxes1_area + tf.transpose(boxes2_area) - intersect
iou = intersect / union
return iou
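# Worked example (illustrative): for boxes1 = [[0, 0, 1, 1]] and
# boxes2 = [[0.5, 0, 1.5, 1]], the intersection area is 0.5 and the union is
# 1 + 1 - 0.5 = 1.5, so the returned IoU is 0.5 / 1.5 = 1/3.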
def ssd_parse_example_proto(example_serialized):
"""Parses an Example proto containing a training example of an image.
Each Example proto contains the following fields that we care about:
image/encoded: <JPEG encoded string>
image/source_id: tf.string
image/height: tf.int64
image/width: tf.int64
image/object/bbox/xmin: tf.VarLenFeature(tf.float32)
image/object/bbox/xmax: tf.VarLenFeature(tf.float32)
    image/object/bbox/ymin: tf.VarLenFeature(tf.float32)
image/object/bbox/ymax: tf.VarLenFeature(tf.float32)
image/object/class/label: tf.VarLenFeature(tf.int64)
image/object/class/text: tf.VarLenFeature(tf.string)
Complete decoder can be found in:
https://github.com/tensorflow/models/blob/master/research/object_detection/data_decoders/tf_example_decoder.py
Args:
example_serialized: scalar Tensor tf.string containing a serialized
Example protocol buffer.
Returns:
A dictionary with the following key-values:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
groundtruth_boxes: Tensor tf.float32 of shape [num_boxes, 4], containing
coordinates of object bounding boxes.
    groundtruth_classes: Tensor tf.int64 of shape [num_boxes, 1], containing
class labels of objects.
source_id: unique image identifier.
raw_shape: [height, width, 3].
"""
feature_map = {
'image/encoded': tf.FixedLenFeature(
(), dtype=tf.string, default_value=''),
'image/source_id': tf.FixedLenFeature((), tf.string, default_value=''),
'image/height': tf.FixedLenFeature((), tf.int64, default_value=1),
'image/width': tf.FixedLenFeature((), tf.int64, default_value=1),
'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
'image/object/class/label': tf.VarLenFeature(dtype=tf.int64),
}
features = tf.parse_single_example(example_serialized, feature_map)
xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 1)
ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 1)
xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 1)
ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 1)
image_buffer = features['image/encoded']
# Bounding box coordinates should be in ltrb order
boxes = tf.concat([ymin, xmin, ymax, xmax], 1)
classes = tf.expand_dims(features['image/object/class/label'].values, 1)
source_id = features['image/source_id']
raw_shape = tf.stack([features['image/height'], features['image/width'], 3])
return {'image_buffer': image_buffer,
'groundtruth_boxes': boxes,
'groundtruth_classes': classes,
'source_id': source_id,
'raw_shape': raw_shape}
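# Usage sketch (illustrative; the TFRecord path is a placeholder):
#
#   dataset = tf.data.TFRecordDataset('/path/to/coco_train.tfrecord')
#   dataset = dataset.map(ssd_parse_example_proto)
#   # Each element is now a dict with image_buffer, groundtruth_boxes,
#   # groundtruth_classes, source_id, and raw_shape.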
def ssd_decode_and_crop(image_buffer, boxes, classes, raw_shape):
"""Crop image randomly and decode the cropped region.
This function will crop an image to meet the following requirements:
1. height to width ratio between 0.5 and 2;
  2. IoUs of some boxes exceed a specified threshold;
  3. At least one box center is in the cropped region.
  We defer the JPEG decoding until after the crop to avoid wasted work.
Reference: https://github.com/chauhan-utk/ssd.DomainAdaptation
Args:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
boxes: Tensor tf.float32 of shape [num_boxes, 4], containing coordinates of
object bounding boxes.
classes: Tensor tf.int64 of shape [num_boxes, 1], containing class labels
of objects.
raw_shape: [height, width, 3].
Returns:
resized_image: decoded, cropped, and resized image Tensor tf.float32 of
shape [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE, 3], value
range 0--255.
cropped_boxes: box coordinates for objects in the cropped region.
cropped_classes: class labels for objects in the cropped region.
"""
num_boxes = tf.shape(boxes)[0]
def no_crop_check():
return (tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32)
< ssd_constants.P_NO_CROP_PER_PASS)
def no_crop_proposal():
return (
tf.ones((), tf.bool),
tf.convert_to_tensor([0, 0, 1, 1], dtype=tf.float32),
tf.ones((num_boxes,), tf.bool),
)
def crop_proposal():
rand_vec = lambda minval, maxval: tf.random_uniform(
shape=(ssd_constants.NUM_CROP_PASSES, 1), minval=minval, maxval=maxval,
dtype=tf.float32)
width, height = rand_vec(0.3, 1), rand_vec(0.3, 1)
left, top = rand_vec(0, 1-width), rand_vec(0, 1-height)
right = left + width
bottom = top + height
ltrb = tf.concat([left, top, right, bottom], axis=1)
min_iou = tf.random_shuffle(ssd_constants.CROP_MIN_IOU_CHOICES)[0]
ious = calc_iou_tensor(ltrb, boxes)
    # Discard any bboxes whose center is not in the cropped image.
xc, yc = [tf.tile(0.5 * (boxes[:, i + 0] + boxes[:, i + 2])[tf.newaxis, :],
(ssd_constants.NUM_CROP_PASSES, 1)) for i in range(2)]
masks = tf.reduce_all(tf.stack([
tf.greater(xc, tf.tile(left, (1, num_boxes))),
tf.less(xc, tf.tile(right, (1, num_boxes))),
tf.greater(yc, tf.tile(top, (1, num_boxes))),
tf.less(yc, tf.tile(bottom, (1, num_boxes))),
], axis=2), axis=2)
    # Check whether each candidate crop is valid.
valid_aspect = tf.logical_and(tf.less(height/width, 2),
tf.less(width/height, 2))
valid_ious = tf.reduce_all(tf.greater(ious, min_iou), axis=1, keepdims=True)
valid_masks = tf.reduce_any(masks, axis=1, keepdims=True)
valid_all = tf.cast(tf.reduce_all(tf.concat(
[valid_aspect, valid_ious, valid_masks], axis=1), axis=1), tf.int32)
# One indexed, as zero is needed for the case of no matches.
index = tf.range(1, 1 + ssd_constants.NUM_CROP_PASSES, dtype=tf.int32)
# Either one-hot, or zeros if there is no valid crop.
selection = tf.equal(tf.reduce_max(index * valid_all), index)
use_crop = tf.reduce_any(selection)
output_ltrb = tf.reduce_sum(tf.multiply(ltrb, tf.tile(tf.cast(
selection, tf.float32)[:, tf.newaxis], (1, 4))), axis=0)
output_masks = tf.reduce_any(tf.logical_and(masks, tf.tile(
selection[:, tf.newaxis], (1, num_boxes))), axis=0)
return use_crop, output_ltrb, output_masks
def proposal(*args):
return tf.cond(
pred=no_crop_check(),
true_fn=no_crop_proposal,
false_fn=crop_proposal,
)
_, crop_bounds, box_masks = tf.while_loop(
cond=lambda x, *_: tf.logical_not(x),
body=proposal,
      loop_vars=[tf.zeros((), tf.bool), tf.zeros((4,), tf.float32),
                 tf.zeros((num_boxes,), tf.bool)],
)
filtered_boxes = tf.boolean_mask(boxes, box_masks, axis=0)
mlperf.logger.log(key=mlperf.tags.NUM_CROPPING_ITERATIONS,
value=ssd_constants.NUM_CROP_PASSES)
# Clip boxes to the cropped region.
filtered_boxes = tf.stack([
tf.maximum(filtered_boxes[:, 0], crop_bounds[0]),
tf.maximum(filtered_boxes[:, 1], crop_bounds[1]),
tf.minimum(filtered_boxes[:, 2], crop_bounds[2]),
tf.minimum(filtered_boxes[:, 3], crop_bounds[3]),
], axis=1)
left = crop_bounds[0]
top = crop_bounds[1]
width = crop_bounds[2] - left
height = crop_bounds[3] - top
cropped_boxes = tf.stack([
(filtered_boxes[:, 0] - left) / width,
(filtered_boxes[:, 1] - top) / height,
(filtered_boxes[:, 2] - left) / width,
(filtered_boxes[:, 3] - top) / height,
], axis=1)
  # crop_window contains integer pixel coordinates of the cropped region. A
  # normalized coordinate value y should be mapped to the image coordinate at
  # y * (height - 1).
raw_shape = tf.cast(raw_shape, tf.float32)
crop_window = tf.stack([left * (raw_shape[0] - 1),
top * (raw_shape[1] - 1),
width * raw_shape[0],
height * raw_shape[1]])
crop_window = tf.cast(crop_window, tf.int32)
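  # Note: tf.image.decode_and_crop_jpeg expects crop_window in the order
  # [crop_y, crop_x, crop_height, crop_width], in pixels, which matches the
  # [ymin, xmin, ...] box ordering used above.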
# Fused op only decodes the cropped portion of an image
cropped_image = tf.image.decode_and_crop_jpeg(
image_buffer, crop_window, channels=3)
# Resize converts image dtype from uint8 to float32, without rescaling values.
resized_image = tf.image.resize_images(
cropped_image, [ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE])
mlperf.logger.log(key=mlperf.tags.INPUT_SIZE,
value=ssd_constants.IMAGE_SIZE)
cropped_classes = tf.boolean_mask(classes, box_masks, axis=0)
return resized_image, cropped_boxes, cropped_classes
def color_jitter(image, brightness=0, contrast=0, saturation=0, hue=0):
"""Distort the color of the image."""
with tf.name_scope('distort_color'):
if brightness > 0:
image = tf.image.random_brightness(image, max_delta=brightness)
if contrast > 0:
image = tf.image.random_contrast(
image, lower=1-contrast, upper=1+contrast)
if saturation > 0:
image = tf.image.random_saturation(
image, lower=1-saturation, upper=1+saturation)
if hue > 0:
image = tf.image.random_hue(image, max_delta=hue)
return image
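# Usage sketch (illustrative; the distortion strengths are assumptions, not
# values prescribed by this file):
#
#   image = color_jitter(image / 255.0, brightness=0.125, contrast=0.5,
#                        saturation=0.5, hue=0.05)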
def normalize_image(images):
"""Normalize image to zero mean and unit variance.
Args:
images: a tensor representing images, at least 3-D.
Returns:
images normalized by mean and stdev.
"""
data_type = images.dtype
mean = tf.constant(ssd_constants.NORMALIZATION_MEAN, data_type)
std = tf.constant(ssd_constants.NORMALIZATION_STD, data_type)
images = tf.divide(tf.subtract(images, mean), std)
mlperf.logger.log(key=mlperf.tags.DATA_NORMALIZATION_MEAN,
value=ssd_constants.NORMALIZATION_MEAN)
mlperf.logger.log(key=mlperf.tags.DATA_NORMALIZATION_STD,
value=ssd_constants.NORMALIZATION_STD)
return images
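# Worked example (illustrative): a red-channel value of 0.5 on a 0-1 scale
# normalizes to (0.5 - 0.485) / 0.229 ~= 0.066.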
class Encoder(object):
"""Encoder for SSD boxes and labels."""
def __init__(self):
similarity_calc = region_similarity_calculator.IouSimilarity()
matcher = argmax_matcher.ArgMaxMatcher(
matched_threshold=ssd_constants.MATCH_THRESHOLD,
unmatched_threshold=ssd_constants.MATCH_THRESHOLD,
negatives_lower_than_unmatched=True,
force_match_for_each_row=True)
box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
scale_factors=ssd_constants.BOX_CODER_SCALES)
self.default_boxes = DefaultBoxes()('ltrb')
self.default_boxes = box_list.BoxList(
tf.convert_to_tensor(self.default_boxes))
self.assigner = target_assigner.TargetAssigner(
similarity_calc, matcher, box_coder)
def encode_labels(self, gt_boxes, gt_labels):
target_boxes = box_list.BoxList(gt_boxes)
encoded_classes, _, encoded_boxes, _, matches = self.assigner.assign(
self.default_boxes, target_boxes, gt_labels)
num_matched_boxes = tf.reduce_sum(
tf.cast(tf.not_equal(matches, -1), tf.float32))
return encoded_classes, encoded_boxes, num_matched_boxes
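# Minimal usage sketch (illustrative; gt_boxes and gt_labels are placeholder
# tensors, not part of the original file):
#
#   encoder = Encoder()
#   gt_boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]], tf.float32)  # [N, 4] boxes
#   gt_labels = tf.constant([[1]], tf.float32)                  # [N, 1] labels
#   classes, boxes, num_matched = encoder.encode_labels(gt_boxes, gt_labels)
#   # classes/boxes hold per-default-box targets; num_matched counts matches.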