"tests/others/test_config.py" did not exist on "8c31925b3b61dab675404e2b1309cfe0488471cd"
Commit ee3997b3 authored by qianyj's avatar qianyj
Browse files

new tf branch for dtk21.10.1

parent 2795dc1f
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Mobilenet Base Class, branched from slim for fp16 performance study."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import contextlib
import copy
import os
import tensorflow.compat.v1 as tf
from tensorflow.contrib import slim as contrib_slim
slim = contrib_slim
@slim.add_arg_scope
def apply_activation(x, name=None, activation_fn=None):
return activation_fn(x, name=name) if activation_fn else x
def _fixed_padding(inputs, kernel_size, rate=1):
"""Pads the input along the spatial dimensions independently of input size.
Pads the input such that if it was used in a convolution with 'VALID' padding,
the output would have the same dimensions as if the unpadded input was used
in a convolution with 'SAME' padding.
Args:
inputs: A tensor of size [batch, height_in, width_in, channels].
    kernel_size: The kernel size to be used in the conv2d or max_pool2d
      operation, as a list of two integers [kernel_height, kernel_width].
rate: An integer, rate for atrous convolution.
Returns:
output: A tensor of size [batch, height_out, width_out, channels] with the
input, either intact (if kernel_size == 1) or padded (if kernel_size > 1).
"""
  kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1),
                           kernel_size[1] + (kernel_size[1] - 1) * (rate - 1)]
pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1]
pad_beg = [pad_total[0] // 2, pad_total[1] // 2]
pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]]
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]],
[pad_beg[1], pad_end[1]], [0, 0]])
return padded_inputs
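# A worked example of the padding arithmetic above (illustrative only): for
# kernel_size=[3, 3] and rate=2, the effective kernel is 3 + 2*1 = 5, so
# pad_total is 4 per spatial dimension, split as pad_beg=2 and pad_end=2. A
# 'VALID' convolution on the padded input then matches the output size of a
# 'SAME' convolution on the unpadded input.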
def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
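# Worked example (illustrative only): _make_divisible(32 * 0.35, 8) first
# rounds 11.2 to the nearest multiple of 8, which gives 8; since 8 < 0.9 *
# 11.2, one extra divisor is added and the result is 16. The 10% guard keeps
# the rounded channel count from dropping too far below the requested one.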
@contextlib.contextmanager
def _set_arg_scope_defaults(defaults):
"""Sets arg scope defaults for all items present in defaults.
Args:
defaults: dictionary/list of pairs, containing a mapping from
function to a dictionary of default args.
Yields:
context manager where all defaults are set.
"""
if hasattr(defaults, 'items'):
items = list(defaults.items())
else:
items = defaults
if not items:
yield
else:
func, default_arg = items[0]
with slim.arg_scope(func, **default_arg):
with _set_arg_scope_defaults(items[1:]):
yield
@slim.add_arg_scope
def depth_multiplier(output_params,
multiplier,
divisible_by=8,
min_depth=8,
**unused_kwargs):
if 'num_outputs' not in output_params:
return
d = output_params['num_outputs']
output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by,
min_depth)
_Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func'])
def op(opfunc, **params):
  multiplier = params.pop('multiplier_transform', depth_multiplier)
return _Op(opfunc, params=params, multiplier_func=multiplier)
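# Illustrative sketch (EXAMPLE_DEF is hypothetical, not part of this file): a
# minimal conv_defs dict of the form consumed by mobilenet_base below. Each
# spec entry wraps a layer function and its params via op(), and
# depth_multiplier later rescales 'num_outputs' in place:
#
#   EXAMPLE_DEF = dict(
#       defaults={(slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'}},
#       spec=[
#           op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]),
#           op(slim.conv2d, stride=1, num_outputs=64, kernel_size=[3, 3]),
#       ])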
class NoOpScope(object):
"""No-op context manager."""
def __enter__(self):
return
def __exit__(self, exc_type, exc_value, traceback):
return False
def safe_arg_scope(funcs, **kwargs):
"""Returns `slim.arg_scope` with all None arguments removed.
Args:
funcs: Functions to pass to `arg_scope`.
**kwargs: Arguments to pass to `arg_scope`.
Returns:
arg_scope or No-op context manager.
  Note: this can be useful if a None value should be interpreted as "do not
  overwrite this parameter value".
"""
filtered_args = {name: value for name, value in kwargs.items()
if value is not None}
if filtered_args:
return slim.arg_scope(funcs, **filtered_args)
else:
return NoOpScope()
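# For example: safe_arg_scope([slim.batch_norm], is_training=None) yields a
# NoOpScope, leaving any enclosing arg_scope value for is_training untouched,
# whereas passing is_training=False would explicitly override it.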
@slim.add_arg_scope
def mobilenet_base( # pylint: disable=invalid-name
inputs,
conv_defs,
multiplier=1.0,
final_endpoint=None,
output_stride=None,
use_explicit_padding=False,
scope=None,
is_training=False):
"""Mobilenet base network.
Constructs a network from inputs to the given final endpoint. By default
the network is constructed in inference mode. To create network
in training mode use:
with slim.arg_scope(mobilenet.training_scope()):
logits, endpoints = mobilenet_base(...)
Args:
inputs: a tensor of shape [batch_size, height, width, channels].
conv_defs: A list of op(...) layers specifying the net architecture.
multiplier: Float multiplier for the depth (number of channels)
for all convolution ops. The value must be greater than zero. Typical
usage will be to set this value in (0, 1) to reduce the number of
parameters or computation cost of the model.
    final_endpoint: The name of the last layer, for early termination. For
      V1-based networks the last layer is "layer_14"; for V2 it is "layer_20".
output_stride: An integer that specifies the requested ratio of input to
output spatial resolution. If not None, then we invoke atrous convolution
if necessary to prevent the network from reducing the spatial resolution
of the activation maps. Allowed values are 1 or any even number, excluding
zero. Typical values are 8 (accurate fully convolutional mode), 16
(fast fully convolutional mode), and 32 (classification mode).
      NOTE: output_stride relies on all subsequent operators supporting dilated
      convolution via the "rate" parameter. This might require wrapping
      non-conv operators to operate properly.
use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
inputs so that the output dimensions are the same as if 'SAME' padding
were used.
scope: optional variable scope.
    is_training: How to set up batch_norm and other ops. Note: most of the time
      this does not need to be set directly. Use mobilenet.training_scope() to
      set up training instead. This parameter is here for backward
      compatibility only. It is safe to set it to the value matching
      training_scope(is_training=...). It is also safe to explicitly set
      it to False, even if there is an outer training_scope set to training
      (the network will be built in inference mode). If this is set to None,
      no arg_scope is added for slim.batch_norm's is_training parameter.
Returns:
tensor_out: output tensor.
end_points: a set of activations for external use, for example summaries or
losses.
Raises:
ValueError: depth_multiplier <= 0, or the target output_stride is not
allowed.
"""
if multiplier <= 0:
raise ValueError('multiplier is not greater than zero.')
# Set conv defs defaults and overrides.
conv_defs_defaults = conv_defs.get('defaults', {})
conv_defs_overrides = conv_defs.get('overrides', {})
if use_explicit_padding:
conv_defs_overrides = copy.deepcopy(conv_defs_overrides)
conv_defs_overrides[
(slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'}
if output_stride is not None:
if output_stride == 0 or (output_stride > 1 and output_stride % 2):
raise ValueError('Output stride must be None, 1 or a multiple of 2.')
# a) Set the tensorflow scope
# b) set padding to default: note we might consider removing this
# since it is also set by mobilenet_scope
# c) set all defaults
# d) set all extra overrides.
with _scope_all(scope, default_scope='Mobilenet'), \
safe_arg_scope([slim.batch_norm], is_training=is_training), \
_set_arg_scope_defaults(conv_defs_defaults), \
_set_arg_scope_defaults(conv_defs_overrides):
# The current_stride variable keeps track of the output stride of the
# activations, i.e., the running product of convolution strides up to the
# current network layer. This allows us to invoke atrous convolution
# whenever applying the next convolution would result in the activations
# having output stride larger than the target output_stride.
current_stride = 1
# The atrous convolution rate parameter.
rate = 1
net = inputs
# Insert default parameters before the base scope which includes
# any custom overrides set in mobilenet.
end_points = {}
scopes = {}
for i, opdef in enumerate(conv_defs['spec']):
params = dict(opdef.params)
opdef.multiplier_func(params, multiplier)
stride = params.get('stride', 1)
if output_stride is not None and current_stride == output_stride:
# If we have reached the target output_stride, then we need to employ
# atrous convolution with stride=1 and multiply the atrous rate by the
# current unit's stride for use in subsequent layers.
layer_stride = 1
layer_rate = rate
rate *= stride
else:
layer_stride = stride
layer_rate = 1
current_stride *= stride
# Update params.
params['stride'] = layer_stride
# Only insert rate to params if rate > 1.
if layer_rate > 1:
params['rate'] = layer_rate
# Set padding
if use_explicit_padding:
if 'kernel_size' in params:
net = _fixed_padding(net, params['kernel_size'], layer_rate)
else:
params['use_explicit_padding'] = True
end_point = 'layer_%d' % (i + 1)
try:
net = opdef.op(net, **params)
except Exception:
print('Failed to create op %i: %r params: %r' % (i, opdef, params))
raise
end_points[end_point] = net
scope = os.path.dirname(net.name)
scopes[scope] = end_point
if final_endpoint is not None and end_point == final_endpoint:
break
# Add all tensors that end with 'output' to
# endpoints
for t in net.graph.get_operations():
scope = os.path.dirname(t.name)
bn = os.path.basename(t.name)
if scope in scopes and t.name.endswith('output'):
end_points[scopes[scope] + '/' + bn] = t.outputs[0]
return net, end_points
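# Usage sketch (assumes the hypothetical EXAMPLE_DEF above; shapes are
# illustrative):
#
#   images = tf.placeholder(tf.float32, [8, 224, 224, 3])
#   net, end_points = mobilenet_base(images, conv_defs=EXAMPLE_DEF,
#                                    multiplier=0.5, output_stride=16)
#
# end_points then maps 'layer_1', 'layer_2', ... to the output of each spec
# entry, plus any tensors in those scopes whose names end with 'output'.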
@contextlib.contextmanager
def _scope_all(scope, default_scope=None):
with tf.variable_scope(scope, default_name=default_scope) as s,\
tf.name_scope(s.original_name_scope):
yield s
@slim.add_arg_scope
def mobilenet(inputs,
num_classes=1001,
prediction_fn=slim.softmax,
reuse=None,
scope='Mobilenet',
base_only=False,
**mobilenet_args):
"""Mobilenet model for classification, supports both V1 and V2.
Note: default mode is inference, use mobilenet.training_scope to create
training network.
Args:
inputs: a tensor of shape [batch_size, height, width, channels].
num_classes: number of predicted classes. If 0 or None, the logits layer
is omitted and the input features to the logits layer (before dropout)
are returned instead.
prediction_fn: a function to get predictions out of logits
(default softmax).
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
base_only: if True will only create the base of the network (no pooling
and no logits).
**mobilenet_args: passed to mobilenet_base verbatim.
- conv_defs: list of conv defs
- multiplier: Float multiplier for the depth (number of channels)
for all convolution ops. The value must be greater than zero. Typical
usage will be to set this value in (0, 1) to reduce the number of
parameters or computation cost of the model.
- output_stride: will ensure that the last layer has at most total stride.
If the architecture calls for more stride than that provided
(e.g. output_stride=16, but the architecture has 5 stride=2 operators),
it will replace output_stride with fractional convolutions using Atrous
Convolutions.
Returns:
logits: the pre-softmax activations, a tensor of size
[batch_size, num_classes]
end_points: a dictionary from components of the network to the corresponding
activation tensor.
Raises:
ValueError: Input rank is invalid.
"""
is_training = mobilenet_args.get('is_training', False)
input_shape = inputs.get_shape().as_list()
if len(input_shape) != 4:
raise ValueError('Expected rank 4 input, was: %d' % len(input_shape))
with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope:
inputs = tf.identity(inputs, 'input')
net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args)
if base_only:
return net, end_points
net = tf.identity(net, name='embedding')
with tf.variable_scope('Logits'):
net = global_pool(net)
end_points['global_pool'] = net
if not num_classes:
return net, end_points
net = slim.dropout(net, scope='Dropout', is_training=is_training)
# 1 x 1 x num_classes
# Note: legacy scope name.
logits = slim.conv2d(
net,
num_classes, [1, 1],
activation_fn=None,
normalizer_fn=None,
biases_initializer=tf.zeros_initializer(),
scope='Conv2d_1c_1x1')
logits = tf.squeeze(logits, [1, 2])
logits = tf.identity(logits, name='output')
end_points['Logits'] = logits
if prediction_fn:
end_points['Predictions'] = prediction_fn(logits, 'Predictions')
return logits, end_points
def global_pool(input_tensor, pool_op=tf.nn.avg_pool):
"""Applies avg pool to produce 1x1 output.
  NOTE: This function is functionally equivalent to reduce_mean, but it has
  baked-in average pool, which has better support across hardware.
Args:
input_tensor: input tensor
pool_op: pooling op (avg pool is default)
Returns:
a tensor batch_size x 1 x 1 x depth.
"""
shape = input_tensor.get_shape().as_list()
if shape[1] is None or shape[2] is None:
kernel_size = tf.convert_to_tensor(
[1, tf.shape(input_tensor)[1],
tf.shape(input_tensor)[2], 1])
else:
kernel_size = [1, shape[1], shape[2], 1]
output = pool_op(
input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID')
# Recover output shape, for unknown shape.
output.set_shape([None, 1, 1, None])
return output
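# Sketch of the equivalence noted in the docstring: for an NHWC tensor x with
# known spatial shape, global_pool(x) computes the same values as
# tf.reduce_mean(x, axis=[1, 2], keepdims=True), but keeps an explicit pooling
# op in the graph, which tends to have broader hardware support.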
def training_scope(is_training=True,
weight_decay=0.00004,
stddev=0.09,
dropout_keep_prob=0.8,
bn_decay=0.997):
"""Defines Mobilenet training scope.
Usage:
with tf.contrib.slim.arg_scope(mobilenet.training_scope()):
logits, endpoints = mobilenet_v2.mobilenet(input_tensor)
  # the network created will be trainable with dropout/batch norm
# initialized appropriately.
Args:
is_training: if set to False this will ensure that all customizations are
set to non-training mode. This might be helpful for code that is reused
across both training/evaluation, but most of the time training_scope with
      value False is not needed. If this is set to None, the parameter is not
      added to the batch_norm arg_scope.
weight_decay: The weight decay to use for regularizing the model.
stddev: Standard deviation for initialization, if negative uses xavier.
dropout_keep_prob: dropout keep probability (not set if equals to None).
bn_decay: decay for the batch norm moving averages (not set if equals to
None).
Returns:
An argument scope to use via arg_scope.
"""
# Note: do not introduce parameters that would change the inference
# model here (for example whether to use bias), modify conv_def instead.
batch_norm_params = {
'decay': bn_decay,
'is_training': is_training
}
  if stddev < 0:
    weight_initializer = slim.initializers.xavier_initializer()
  else:
    weight_initializer = tf.truncated_normal_initializer(stddev=stddev)
  # Set weight_decay for weights in Conv and FC layers.
  with slim.arg_scope(
      [slim.conv2d, slim.fully_connected, slim.separable_conv2d],
      weights_initializer=weight_initializer,
      normalizer_fn=slim.batch_norm), \
slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\
safe_arg_scope([slim.batch_norm], **batch_norm_params), \
safe_arg_scope([slim.dropout], is_training=is_training,
keep_prob=dropout_keep_prob), \
slim.arg_scope([slim.conv2d], \
weights_regularizer=slim.l2_regularizer(weight_decay)), \
slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s:
return s
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD300 Model Configuration.
References:
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander C. Berg
SSD: Single Shot MultiBox Detector
arXiv:1512.02325
Ported from MLPerf reference implementation:
https://github.com/mlperf/reference/tree/ssd/single_stage_detector/ssd
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import os
import re
import threading
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import constants
import mlperf
import ssd_constants
from cnn_util import log_fn
from models import model as model_lib
from models import resnet_model
from tensorflow.contrib import layers as contrib_layers
from tensorflow.python.ops import variables
BACKBONE_MODEL_SCOPE_NAME = 'resnet34_backbone'
class SSD300Model(model_lib.CNNModel):
"""Single Shot Multibox Detection (SSD) model for 300x300 image datasets."""
def __init__(self, label_num=ssd_constants.NUM_CLASSES, batch_size=32,
learning_rate=1e-3, backbone='resnet34', params=None):
super(SSD300Model, self).__init__('ssd300', 300, batch_size, learning_rate,
params=params)
# For COCO dataset, 80 categories + 1 background = 81 labels
self.label_num = label_num
    # Currently only ResNet-34 is supported as the backbone model.
if backbone != 'resnet34':
raise ValueError('Invalid backbone model %s for SSD.' % backbone)
mlperf.logger.log(key=mlperf.tags.BACKBONE, value=backbone)
# Number of channels and default boxes associated with the following layers:
# ResNet34 layer, Conv7, Conv8_2, Conv9_2, Conv10_2, Conv11_2
self.out_chan = [256, 512, 512, 256, 256, 256]
mlperf.logger.log(key=mlperf.tags.LOC_CONF_OUT_CHANNELS,
value=self.out_chan)
# Number of default boxes from layers of different scales
# 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
self.num_dboxes = [4, 6, 6, 6, 4, 4]
mlperf.logger.log(key=mlperf.tags.NUM_DEFAULTS_PER_CELL,
value=self.num_dboxes)
# TODO(haoyuzhang): in order to correctly restore in replicated mode, need
# to create a saver for each tower before graph is finalized. Use variable
# manager for better efficiency.
self.backbone_savers = []
# Collected predictions for eval stage. It maps each image id in eval
# dataset to a dict containing the following information:
# source_id: raw ID of image
# raw_shape: raw shape of image
# pred_box: encoded box coordinates of prediction
# pred_scores: scores of classes in prediction
self.predictions = {}
# Global step when predictions are collected.
self.eval_global_step = 0
    # Average precision. In asynchronous eval mode, this is the latest AP we
    # have so far, which may not be the result at the current eval step.
    # Process, queues, and thread for asynchronous evaluation. When enabled,
    # create a separate process (async_eval_process) that continuously pulls
    # intermediate results from the predictions queue (a multiprocessing
    # queue), processes them, and pushes final results into the results queue
    # (another multiprocessing queue). The main thread is responsible for
    # pushing messages into the predictions queue, and starts a separate
    # thread to continuously pull messages from the results queue to update
    # the final results.
# Message in predictions queue should be a tuple of two elements:
# (evaluation step, predictions)
# Message in results queue should be a tuple of two elements:
# (evaluation step, final results)
self.async_eval_process = None
self.async_eval_predictions_queue = None
self.async_eval_results_queue = None
self.async_eval_results_getter_thread = None
# The MLPerf reference uses a starting lr of 1e-3 at bs=32.
self.base_lr_batch_size = 32
def skip_final_affine_layer(self):
return True
def gpu_preprocess_nhwc(self, images, phase_train=True):
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
except ImportError:
      raise ImportError('To use the COCO dataset, you must clone the '
                        'repo https://github.com/tensorflow/models and add '
                        'tensorflow/models and tensorflow/models/research to '
                        'the PYTHONPATH, and compile the protobufs by '
                        'following https://github.com/tensorflow/models/blob/'
                        'master/research/object_detection/g3doc/installation.md'
                        '#protobuf-compilation ; to evaluate using the COCO '
                        'metric, download and install the Python COCO API '
                        'from https://github.com/cocodataset/cocoapi')
if phase_train:
images = ssd_dataloader.color_jitter(
images, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)
images = ssd_dataloader.normalize_image(images)
return images
def add_backbone_model(self, cnn):
# --------------------------------------------------------------------------
# Resnet-34 backbone model -- modified for SSD
# --------------------------------------------------------------------------
# Input 300x300, output 150x150
cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True)
cnn.mpool(3, 3, 2, 2, mode='SAME')
resnet34_layers = [3, 4, 6, 3]
version = 'v1'
# ResNet-34 block group 1
# Input 150x150, output 75x75
for i in range(resnet34_layers[0]):
# Last argument forces residual_block to use projection shortcut, even
# though the numbers of input and output channels are equal
resnet_model.residual_block(cnn, 64, 1, version)
# ResNet-34 block group 2
# Input 75x75, output 38x38
for i in range(resnet34_layers[1]):
stride = 2 if i == 0 else 1
resnet_model.residual_block(cnn, 128, stride, version, i == 0)
# ResNet-34 block group 3
# This block group is modified: first layer uses stride=1 so that the image
# size does not change in group of layers
# Input 38x38, output 38x38
    for i in range(resnet34_layers[2]):
      # The following line is intentionally commented out to differentiate
      # from the original ResNet-34 model:
      # stride = 2 if i == 0 else 1
      # stride is still 1 here, carried over from the last iteration of the
      # previous block group.
      resnet_model.residual_block(cnn, 256, stride, version, i == 0)
# ResNet-34 block group 4: removed final block group
# The following 3 lines are intentionally commented out to differentiate
# from the original ResNet-34 model
# for i in range(resnet34_layers[3]):
# stride = 2 if i == 0 else 1
# resnet_model.residual_block(cnn, 512, stride, version, i == 0)
def add_inference(self, cnn):
cnn.use_batch_norm = True
cnn.batch_norm_config = {'decay': ssd_constants.BATCH_NORM_DECAY,
'epsilon': ssd_constants.BATCH_NORM_EPSILON,
'scale': True}
with tf.variable_scope(BACKBONE_MODEL_SCOPE_NAME):
self.add_backbone_model(cnn)
# --------------------------------------------------------------------------
# SSD additional layers
# --------------------------------------------------------------------------
def add_ssd_layer(cnn, depth, k_size, stride, mode):
return cnn.conv(
depth,
k_size,
k_size,
stride,
stride,
mode=mode,
use_batch_norm=False,
kernel_initializer=contrib_layers.xavier_initializer())
# Activations for feature maps of different layers
self.activations = [cnn.top_layer]
# Conv7_1, Conv7_2
# Input 38x38, output 19x19
add_ssd_layer(cnn, 256, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))
# Conv8_1, Conv8_2
# Input 19x19, output 10x10
add_ssd_layer(cnn, 256, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 512, 3, 2, 'same'))
# Conv9_1, Conv9_2
# Input 10x10, output 5x5
add_ssd_layer(cnn, 128, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 256, 3, 2, 'same'))
# Conv10_1, Conv10_2
# Input 5x5, output 3x3
add_ssd_layer(cnn, 128, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))
# Conv11_1, Conv11_2
# Input 3x3, output 1x1
add_ssd_layer(cnn, 128, 1, 1, 'valid')
self.activations.append(add_ssd_layer(cnn, 256, 3, 1, 'valid'))
self.loc = []
self.conf = []
for nd, ac, oc in zip(self.num_dboxes, self.activations, self.out_chan):
l = cnn.conv(
nd * 4,
3,
3,
1,
1,
input_layer=ac,
num_channels_in=oc,
activation=None,
use_batch_norm=False,
kernel_initializer=contrib_layers.xavier_initializer())
scale = l.get_shape()[-1]
# shape = [batch_size, nd * 4, scale, scale]
l = tf.reshape(l, [self.batch_size, nd, 4, scale, scale])
# shape = [batch_size, nd, 4, scale, scale]
l = tf.transpose(l, [0, 1, 3, 4, 2])
# shape = [batch_size, nd, scale, scale, 4]
self.loc.append(tf.reshape(l, [self.batch_size, -1, 4]))
# shape = [batch_size, nd * scale * scale, 4]
c = cnn.conv(
nd * self.label_num,
3,
3,
1,
1,
input_layer=ac,
num_channels_in=oc,
activation=None,
use_batch_norm=False,
kernel_initializer=contrib_layers.xavier_initializer())
# shape = [batch_size, nd * label_num, scale, scale]
c = tf.reshape(c, [self.batch_size, nd, self.label_num, scale, scale])
# shape = [batch_size, nd, label_num, scale, scale]
c = tf.transpose(c, [0, 1, 3, 4, 2])
# shape = [batch_size, nd, scale, scale, label_num]
self.conf.append(tf.reshape(c, [self.batch_size, -1, self.label_num]))
# shape = [batch_size, nd * scale * scale, label_num]
# Shape of locs: [batch_size, NUM_SSD_BOXES, 4]
# Shape of confs: [batch_size, NUM_SSD_BOXES, label_num]
locs, confs = tf.concat(self.loc, 1), tf.concat(self.conf, 1)
# Pack location and confidence outputs into a single output layer
# Shape of logits: [batch_size, NUM_SSD_BOXES, 4+label_num]
logits = tf.concat([locs, confs], 2)
cnn.top_layer = logits
cnn.top_size = 4 + self.label_num
return cnn.top_layer
def get_learning_rate(self, global_step, batch_size):
rescaled_lr = self.get_scaled_base_learning_rate(batch_size)
# Defined in MLPerf reference model
boundaries = [160000, 200000]
boundaries = [b * self.base_lr_batch_size // batch_size for b in boundaries]
decays = [1, 0.1, 0.01]
learning_rates = [rescaled_lr * d for d in decays]
lr = tf.train.piecewise_constant(global_step, boundaries, learning_rates)
    # Linear warmup over roughly 5 epochs (118287 is the COCO train set size).
    warmup_steps = int(118287 / batch_size * 5)
warmup_lr = (
rescaled_lr * tf.cast(global_step, tf.float32) / tf.cast(
warmup_steps, tf.float32))
return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr)
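  # Worked example for get_learning_rate above (illustrative only): with
  # base_lr_batch_size=32 and batch_size=64, rescaled_lr doubles, the decay
  # boundaries shrink to [80000, 100000] steps, and
  # warmup_steps = int(118287 / 64 * 5) = 9241, over which the lr ramps
  # linearly from 0 to rescaled_lr.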
def get_scaled_base_learning_rate(self, batch_size):
"""Calculates base learning rate for creating lr schedule.
In replicated mode, gradients are summed rather than averaged which, with
the sgd and momentum optimizers, increases the effective learning rate by
lr * num_gpus. Dividing the base lr by num_gpus negates the increase.
Args:
batch_size: Total batch-size.
Returns:
Base learning rate to use to create lr schedule.
"""
base_lr = self.learning_rate
if self.params.variable_update == 'replicated':
base_lr = self.learning_rate / self.params.num_gpus
scaled_lr = base_lr * (batch_size / self.base_lr_batch_size)
return scaled_lr
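  # Worked example for get_scaled_base_learning_rate above (illustrative
  # only): with learning_rate=1e-3, variable_update='replicated', num_gpus=8
  # and a total batch_size of 256, base_lr = 1e-3 / 8 and
  # scaled_lr = (1e-3 / 8) * (256 / 32) = 1e-3, i.e. dividing by num_gpus
  # exactly cancels the lr increase caused by summed replicated gradients.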
def _collect_backbone_vars(self):
backbone_vars = tf.get_collection(
tf.GraphKeys.GLOBAL_VARIABLES, scope='.*'+ BACKBONE_MODEL_SCOPE_NAME)
var_list = {}
# Assume variables in the checkpoint are following the naming convention of
# a model checkpoint trained with TF official model
# TODO(haoyuzhang): the following variable name parsing is hacky and easy
# to break if there is change in naming convention of either benchmarks or
# official models.
for v in backbone_vars:
# conv2d variable example (model <-- checkpoint):
# v/cg/conv24/conv2d/kernel:0 <-- conv2d_24/kernel
if 'conv2d' in v.name:
re_match = re.search(r'conv(\d+)/conv2d/(.+):', v.name)
if re_match:
layer_id = int(re_match.group(1))
param_name = re_match.group(2)
vname_in_ckpt = self._var_name_in_official_model_ckpt(
'conv2d', layer_id, param_name)
var_list[vname_in_ckpt] = v
      # batchnorm variable example (model <-- checkpoint):
# v/cg/conv24/batchnorm25/gamma:0 <-- batch_normalization_25/gamma
elif 'batchnorm' in v.name:
re_match = re.search(r'batchnorm(\d+)/(.+):', v.name)
if re_match:
layer_id = int(re_match.group(1))
param_name = re_match.group(2)
vname_in_ckpt = self._var_name_in_official_model_ckpt(
'batch_normalization', layer_id, param_name)
var_list[vname_in_ckpt] = v
return var_list
def _var_name_in_official_model_ckpt(self, layer_name, layer_id, param_name):
"""Return variable names according to convention in TF official models."""
vname_in_ckpt = layer_name
if layer_id > 0:
vname_in_ckpt += '_' + str(layer_id)
vname_in_ckpt += '/' + param_name
return vname_in_ckpt
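  # Example mappings produced by the helper above:
  #   _var_name_in_official_model_ckpt('conv2d', 0, 'kernel')
  #       -> 'conv2d/kernel'
  #   _var_name_in_official_model_ckpt('batch_normalization', 25, 'gamma')
  #       -> 'batch_normalization_25/gamma'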
def loss_function(self, inputs, build_network_result):
logits = build_network_result.logits
# Unpack model output back to locations and confidence scores of predictions
# Shape of pred_loc: [batch_size, NUM_SSD_BOXES, 4]
# Shape of pred_label: [batch_size, NUM_SSD_BOXES, label_num]
pred_loc, pred_label = tf.split(logits, [4, self.label_num], 2)
# Shape of gt_loc: [batch_size, NUM_SSD_BOXES, 4]
# Shape of gt_label: [batch_size, NUM_SSD_BOXES, 1]
# Shape of num_gt: [batch_size]
_, gt_loc, gt_label, num_gt = inputs
gt_label = tf.cast(gt_label, tf.int32)
box_loss = self._localization_loss(pred_loc, gt_loc, gt_label, num_gt)
class_loss = self._classification_loss(pred_label, gt_label, num_gt)
tf.summary.scalar('box_loss', tf.reduce_mean(box_loss))
tf.summary.scalar('class_loss', tf.reduce_mean(class_loss))
return class_loss + box_loss
def _localization_loss(self, pred_loc, gt_loc, gt_label, num_matched_boxes):
"""Computes the localization loss.
Computes the localization loss using smooth l1 loss.
Args:
      pred_loc: a flattened tensor that includes all predicted locations. The
        shape is [batch_size, num_anchors, 4].
      gt_loc: a tensor representing box regression targets in
        [batch_size, num_anchors, 4].
      gt_label: a tensor that represents the classification groundtruth
        targets. The shape is [batch_size, num_anchors, 1].
      num_matched_boxes: the number of anchors that are matched to groundtruth
        targets, used as the loss normalizer. The shape is [batch_size].
    Returns:
      box_loss: a float32 representing the total box regression loss.
"""
mask = tf.greater(tf.squeeze(gt_label), 0)
float_mask = tf.cast(mask, tf.float32)
smooth_l1 = tf.reduce_sum(tf.losses.huber_loss(
gt_loc, pred_loc,
reduction=tf.losses.Reduction.NONE
), axis=2)
smooth_l1 = tf.multiply(smooth_l1, float_mask)
box_loss = tf.reduce_sum(smooth_l1, axis=1)
return tf.reduce_mean(box_loss / num_matched_boxes)
def _classification_loss(self, pred_label, gt_label, num_matched_boxes):
"""Computes the classification loss.
Computes the classification loss with hard negative mining.
Args:
      pred_label: a flattened tensor that includes all predicted classes. The
        shape is [batch_size, num_anchors, num_classes].
      gt_label: a tensor that represents the classification groundtruth
        targets. The shape is [batch_size, num_anchors, 1].
      num_matched_boxes: the number of anchors that are matched to groundtruth
        targets. This is used as the loss normalizer.
    Returns:
      class_loss: a float32 representing the total classification loss.
"""
cross_entropy = tf.losses.sparse_softmax_cross_entropy(
gt_label, pred_label, reduction=tf.losses.Reduction.NONE)
mask = tf.greater(tf.squeeze(gt_label), 0)
float_mask = tf.cast(mask, tf.float32)
# Hard example mining
neg_masked_cross_entropy = cross_entropy * (1 - float_mask)
relative_position = tf.argsort(
tf.argsort(
neg_masked_cross_entropy, direction='DESCENDING'))
num_neg_boxes = tf.minimum(
tf.to_int32(num_matched_boxes) * ssd_constants.NEGS_PER_POSITIVE,
ssd_constants.NUM_SSD_BOXES)
top_k_neg_mask = tf.cast(tf.less(
relative_position,
tf.tile(num_neg_boxes[:, tf.newaxis], (1, ssd_constants.NUM_SSD_BOXES))
), tf.float32)
class_loss = tf.reduce_sum(
tf.multiply(cross_entropy, float_mask + top_k_neg_mask), axis=1)
return tf.reduce_mean(class_loss / num_matched_boxes)
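  # Note on the double argsort in _classification_loss above (illustrative
  # only): for a row of negative-masked losses [0.3, 0.9, 0.1],
  # argsort(direction='DESCENDING') gives [1, 0, 2], and a second argsort
  # turns that into per-element ranks [1, 0, 2]. relative_position therefore
  # holds each box's rank among the hardest negatives, and boxes ranked below
  # num_neg_boxes are kept by top_k_neg_mask.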
def add_backbone_saver(self):
# Create saver with mapping from variable names in checkpoint of backbone
# model to variables in SSD model
backbone_var_list = self._collect_backbone_vars()
self.backbone_savers.append(tf.train.Saver(backbone_var_list))
def load_backbone_model(self, sess, backbone_model_path):
for saver in self.backbone_savers:
saver.restore(sess, backbone_model_path)
def get_input_data_types(self, subset):
if subset == 'validation':
return [self.data_type, tf.float32, tf.float32, tf.float32, tf.int32]
return [self.data_type, tf.float32, tf.float32, tf.float32]
def get_input_shapes(self, subset):
"""Return encoded tensor shapes for train and eval data respectively."""
if subset == 'validation':
# Validation data shapes:
# 1. images
# 2. ground truth locations of boxes
# 3. ground truth classes of objects in boxes
# 4. source image IDs
# 5. raw image shapes
return [
[self.batch_size, self.image_size, self.image_size, self.depth],
[self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 4],
[self.batch_size, ssd_constants.MAX_NUM_EVAL_BOXES, 1],
[self.batch_size],
[self.batch_size, 3],
]
# Training data shapes:
# 1. images
# 2. ground truth locations of boxes
# 3. ground truth classes of objects in boxes
# 4. numbers of objects in images
return [
[self.batch_size, self.image_size, self.image_size, self.depth],
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 4],
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 1],
[self.batch_size]
]
def accuracy_function(self, inputs, logits):
"""Returns the ops to measure the mean precision of the model."""
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
from object_detection.box_coders import faster_rcnn_box_coder # pylint: disable=g-import-not-at-top
from object_detection.core import box_coder # pylint: disable=g-import-not-at-top
from object_detection.core import box_list # pylint: disable=g-import-not-at-top
except ImportError:
      raise ImportError('To use the COCO dataset, you must clone the '
                        'repo https://github.com/tensorflow/models and add '
                        'tensorflow/models and tensorflow/models/research to '
                        'the PYTHONPATH, and compile the protobufs by '
                        'following https://github.com/tensorflow/models/blob/'
                        'master/research/object_detection/g3doc/installation.md'
                        '#protobuf-compilation ; to evaluate using the COCO '
                        'metric, download and install the Python COCO API '
                        'from https://github.com/cocodataset/cocoapi')
# Unpack model output back to locations and confidence scores of predictions
# pred_locs: relative locations (coordinates) of objects in all SSD boxes
# shape: [batch_size, NUM_SSD_BOXES, 4]
# pred_labels: confidence scores of objects being of all categories
# shape: [batch_size, NUM_SSD_BOXES, label_num]
pred_locs, pred_labels = tf.split(logits, [4, self.label_num], 2)
ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
scale_factors=ssd_constants.BOX_CODER_SCALES)
anchors = box_list.BoxList(
tf.convert_to_tensor(ssd_dataloader.DefaultBoxes()('ltrb')))
pred_boxes = box_coder.batch_decode(
encoded_boxes=pred_locs, box_coder=ssd_box_coder, anchors=anchors)
pred_scores = tf.nn.softmax(pred_labels, axis=2)
# TODO(haoyuzhang): maybe use `gt_boxes` and `gt_classes` for visualization.
_, gt_boxes, gt_classes, source_id, raw_shape = inputs # pylint: disable=unused-variable
return {
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.PRED_BOXES): pred_boxes,
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.PRED_SCORES): pred_scores,
# TODO(haoyuzhang): maybe use these values for visualization.
# constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_boxes': gt_boxes,
# constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_classes': gt_classes,
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.SOURCE_ID): source_id,
(constants.UNREDUCED_ACCURACY_OP_PREFIX +
ssd_constants.RAW_SHAPE): raw_shape
}
def postprocess(self, results):
"""Postprocess results returned from model."""
try:
import coco_metric # pylint: disable=g-import-not-at-top
except ImportError:
      raise ImportError('To use the COCO dataset, you must clone the '
                        'repo https://github.com/tensorflow/models and add '
                        'tensorflow/models and tensorflow/models/research to '
                        'the PYTHONPATH, and compile the protobufs by '
                        'following https://github.com/tensorflow/models/blob/'
                        'master/research/object_detection/g3doc/installation.md'
                        '#protobuf-compilation ; to evaluate using the COCO '
                        'metric, download and install the Python COCO API '
                        'from https://github.com/cocodataset/cocoapi')
pred_boxes = results[ssd_constants.PRED_BOXES]
pred_scores = results[ssd_constants.PRED_SCORES]
# TODO(haoyuzhang): maybe use these values for visualization.
# gt_boxes = results['gt_boxes']
# gt_classes = results['gt_classes']
source_id = results[ssd_constants.SOURCE_ID]
raw_shape = results[ssd_constants.RAW_SHAPE]
    # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
    # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
    # `num_eval_epochs` to 1 is not enough and will often miss some images. We
    # expect the user to set `num_eval_epochs` to >1, which will leave some
    # unused images from previous steps in `predictions`. Here we check if we
    # are doing eval at a new global step.
if results['global_step'] > self.eval_global_step:
self.eval_global_step = results['global_step']
self.predictions.clear()
for i, sid in enumerate(source_id):
self.predictions[int(sid)] = {
ssd_constants.PRED_BOXES: pred_boxes[i],
ssd_constants.PRED_SCORES: pred_scores[i],
ssd_constants.SOURCE_ID: source_id[i],
ssd_constants.RAW_SHAPE: raw_shape[i]
}
    # The COCO metric calculates mAP only after a full epoch of evaluation.
    # Return dummy results for top_N_accuracy to be compatible with
    # benchmark_cnn.py.
if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
log_fn('Got results for all {:d} eval examples. Calculate mAP...'.format(
ssd_constants.COCO_NUM_VAL_IMAGES))
annotation_file = os.path.join(self.params.data_dir,
ssd_constants.ANNOTATION_FILE)
      # The size of predictions before decoding is about 15--30 GB, while the
      # size after decoding is 100--200 MB. When using async eval mode,
      # decoding takes 20--30 seconds of main thread time but is necessary to
      # avoid OOM during inter-process communication.
decoded_preds = coco_metric.decode_predictions(self.predictions.values())
self.predictions.clear()
if self.params.collect_eval_results_async:
def _eval_results_getter():
"""Iteratively get eval results from async eval process."""
while True:
step, eval_results = self.async_eval_results_queue.get()
self.eval_coco_ap = eval_results['COCO/AP']
mlperf.logger.log_eval_accuracy(
self.eval_coco_ap, step, self.batch_size * self.params.num_gpus,
ssd_constants.COCO_NUM_TRAIN_IMAGES)
if self.reached_target():
# Reached target, clear all pending messages in predictions queue
# and insert poison pill to stop the async eval process.
while not self.async_eval_predictions_queue.empty():
self.async_eval_predictions_queue.get()
self.async_eval_predictions_queue.put('STOP')
break
if not self.async_eval_process:
# Limiting the number of messages in predictions queue to prevent OOM.
# Each message (predictions data) can potentially consume a lot of
# memory, and normally there should only be few messages in the queue.
# If often blocked on this, consider reducing eval frequency.
self.async_eval_predictions_queue = multiprocessing.Queue(2)
self.async_eval_results_queue = multiprocessing.Queue()
        # The reason to use a Process as opposed to a Thread is mainly that
        # the eval runner is computationally intensive. Python multithreading
        # does not truly run in parallel, so a runner thread would get
        # significantly delayed (or alternatively delay the main thread).
self.async_eval_process = multiprocessing.Process(
target=coco_metric.async_eval_runner,
args=(self.async_eval_predictions_queue,
self.async_eval_results_queue,
annotation_file))
self.async_eval_process.daemon = True
self.async_eval_process.start()
self.async_eval_results_getter_thread = threading.Thread(
target=_eval_results_getter, args=())
self.async_eval_results_getter_thread.daemon = True
self.async_eval_results_getter_thread.start()
self.async_eval_predictions_queue.put(
(self.eval_global_step, decoded_preds))
return {'top_1_accuracy': 0, 'top_5_accuracy': 0.}
eval_results = coco_metric.compute_map(decoded_preds, annotation_file)
self.eval_coco_ap = eval_results['COCO/AP']
ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
for metric_key, metric_value in eval_results.items():
ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value
mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step,
self.batch_size * self.params.num_gpus,
ssd_constants.COCO_NUM_TRAIN_IMAGES)
return ret
log_fn('Got {:d} out of {:d} eval examples.'
' Waiting for the remaining to calculate mAP...'.format(
len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
def get_synthetic_inputs(self, input_name, nclass):
"""Generating synthetic data matching real data shape and type."""
inputs = tf.random_uniform(
self.get_input_shapes('train')[0], dtype=self.data_type)
inputs = variables.VariableV1(inputs, trainable=False,
collections=[tf.GraphKeys.LOCAL_VARIABLES],
name=input_name)
boxes = tf.random_uniform(
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 4], dtype=tf.float32)
classes = tf.random_uniform(
[self.batch_size, ssd_constants.NUM_SSD_BOXES, 1], dtype=tf.float32)
nboxes = tf.random_uniform(
[self.batch_size], minval=1, maxval=10, dtype=tf.float32)
return (inputs, boxes, classes, nboxes)
def reached_target(self):
return (self.params.stop_at_top_1_accuracy and
self.eval_coco_ap >= self.params.stop_at_top_1_accuracy)
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Image pre-processing utilities.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import cnn_util
from tensorflow.python.data.ops import multi_device_iterator_ops
from tensorflow.python.framework import function
from tensorflow.python.layers import utils
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.platform import gfile
import mlperf
def parse_example_proto(example_serialized):
"""Parses an Example proto containing a training example of an image.
The output of the build_image_data.py image preprocessing script is a dataset
containing serialized Example protocol buffers. Each Example proto contains
the following fields:
image/height: 462
image/width: 581
image/colorspace: 'RGB'
image/channels: 3
image/class/label: 615
image/class/synset: 'n03623198'
image/class/text: 'knee pad'
image/object/bbox/xmin: 0.1
image/object/bbox/xmax: 0.9
image/object/bbox/ymin: 0.2
image/object/bbox/ymax: 0.6
image/object/bbox/label: 615
image/format: 'JPEG'
image/filename: 'ILSVRC2012_val_00041207.JPEG'
image/encoded: <JPEG encoded string>
Args:
example_serialized: scalar Tensor tf.string containing a serialized
Example protocol buffer.
Returns:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
label: Tensor tf.int32 containing the label.
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged as
[ymin, xmin, ymax, xmax].
text: Tensor tf.string containing the human-readable label.
"""
# Dense features in Example proto.
feature_map = {
'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
default_value=''),
'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
default_value=-1),
'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
default_value=''),
}
sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
# Sparse features in Example proto.
feature_map.update(
{k: sparse_float32 for k in ['image/object/bbox/xmin',
'image/object/bbox/ymin',
'image/object/bbox/xmax',
'image/object/bbox/ymax']})
features = tf.parse_single_example(example_serialized, feature_map)
label = tf.cast(features['image/class/label'], dtype=tf.int32)
xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
# Note that we impose an ordering of (y, x) just to make life difficult.
bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
# Force the variable number of bounding boxes into the shape
# [1, num_boxes, coords].
bbox = tf.expand_dims(bbox, 0)
bbox = tf.transpose(bbox, [0, 2, 1])
return features['image/encoded'], label, bbox, features['image/class/text']
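# Usage sketch (assumed pipeline, not part of this file):
#
#   ds = tf.data.TFRecordDataset(filenames)
#   ds = ds.map(parse_example_proto)
#   # each element is now (image_buffer, label, bbox, text)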
_RESIZE_METHOD_MAP = {
'nearest': tf.image.ResizeMethod.NEAREST_NEIGHBOR,
'bilinear': tf.image.ResizeMethod.BILINEAR,
'bicubic': tf.image.ResizeMethod.BICUBIC,
'area': tf.image.ResizeMethod.AREA
}
def get_image_resize_method(resize_method, batch_position=0):
"""Get tensorflow resize method.
If resize_method is 'round_robin', return different methods based on batch
position in a round-robin fashion. NOTE: If the batch size is not a multiple
of the number of methods, then the distribution of methods will not be
uniform.
Args:
resize_method: (string) nearest, bilinear, bicubic, area, or round_robin.
batch_position: position of the image in a batch. NOTE: this argument can
be an integer or a tensor
Returns:
    one of the resize types defined in tf.image.ResizeMethod.
"""
if resize_method != 'round_robin':
return _RESIZE_METHOD_MAP[resize_method]
# return a resize method based on batch position in a round-robin fashion.
resize_methods = list(_RESIZE_METHOD_MAP.values())
def lookup(index):
return resize_methods[index]
def resize_method_0():
return utils.smart_cond(batch_position % len(resize_methods) == 0,
lambda: lookup(0), resize_method_1)
def resize_method_1():
return utils.smart_cond(batch_position % len(resize_methods) == 1,
lambda: lookup(1), resize_method_2)
def resize_method_2():
return utils.smart_cond(batch_position % len(resize_methods) == 2,
lambda: lookup(2), lambda: lookup(3))
# NOTE(jsimsa): Unfortunately, we cannot use a single recursive function here
# because TF would not be able to construct a finite graph.
return resize_method_0()
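# For example (illustrative only, assuming the insertion order of
# _RESIZE_METHOD_MAP is preserved): with resize_method='round_robin' and the
# four methods above, batch_position=5 selects index 5 % 4 == 1, i.e.
# bilinear. An integer batch_position resolves statically via smart_cond; a
# tensor position builds the nested cond graph above.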
def decode_jpeg(image_buffer, scope=None): # , dtype=tf.float32):
"""Decode a JPEG string into one 3-D float image Tensor.
Args:
image_buffer: scalar string Tensor.
scope: Optional scope for op_scope.
Returns:
3-D float Tensor with values ranging from [0, 1).
"""
# with tf.op_scope([image_buffer], scope, 'decode_jpeg'):
# with tf.name_scope(scope, 'decode_jpeg', [image_buffer]):
with tf.name_scope(scope or 'decode_jpeg'):
# Decode the string as an RGB JPEG.
# Note that the resulting image contains an unknown height and width
# that is set dynamically by decode_jpeg. In other words, the height
# and width of image is unknown at compile-time.
image = tf.image.decode_jpeg(image_buffer, channels=3,
fancy_upscaling=False,
dct_method='INTEGER_FAST')
# image = tf.Print(image, [tf.shape(image)], 'Image shape: ')
return image
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
def normalized_image(images):
# Rescale from [0, 255] to [0, 2]
images = tf.multiply(images, 1. / 127.5)
# Rescale to [-1, 1]
mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION, value=[1.0] * 3)
return tf.subtract(images, 1.0)
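# Worked example: a pixel value of 0 maps to 0 / 127.5 - 1 = -1, 127.5 maps to
# 0, and 255 maps to 255 / 127.5 - 1 = 1, so outputs lie in [-1, 1].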
def eval_image(image,
height,
width,
batch_position,
resize_method,
summary_verbosity=0):
"""Get the image for model evaluation.
  We preprocess the image similarly to Slim, see
  https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py
Validation images do not have bounding boxes, so to crop the image, we first
resize the image such that the aspect ratio is maintained and the resized
height and width are both at least 1.145 times `height` and `width`
respectively. Then, we do a central crop to size (`height`, `width`).
Args:
image: 3-D float Tensor representing the image.
height: The height of the image that will be returned.
width: The width of the image that will be returned.
batch_position: position of the image in a batch, which affects how images
are distorted and resized. NOTE: this argument can be an integer or a
tensor
resize_method: one of the strings 'round_robin', 'nearest', 'bilinear',
'bicubic', or 'area'.
summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both
summaries and checkpoints.
Returns:
An image of size (output_height, output_width, 3) that is resized and
cropped as described above.
"""
# TODO(reedwm): Currently we resize then crop. Investigate if it's faster to
# crop then resize.
with tf.name_scope('eval_image'):
if summary_verbosity >= 3:
tf.summary.image(
'original_image', tf.expand_dims(image, 0))
shape = tf.shape(image)
image_height = shape[0]
image_width = shape[1]
image_height_float = tf.cast(image_height, tf.float32)
image_width_float = tf.cast(image_width, tf.float32)
# This value is chosen so that in resnet, images are cropped to a size of
# 256 x 256, which matches what other implementations do. The final image
# size for resnet is 224 x 224, and floor(224 * 1.145) = 256.
scale_factor = 1.145
# Compute resize_height and resize_width to be the minimum values such that
# 1. The aspect ratio is maintained (i.e. resize_height / resize_width is
# image_height / image_width), and
# 2. resize_height >= height * `scale_factor`, and
# 3. resize_width >= width * `scale_factor`
max_ratio = tf.maximum(height / image_height_float,
width / image_width_float)
resize_height = tf.cast(image_height_float * max_ratio * scale_factor,
tf.int32)
resize_width = tf.cast(image_width_float * max_ratio * scale_factor,
tf.int32)
mlperf.logger.log_input_resize_aspect_preserving(height, width,
scale_factor)
# Resize the image to shape (`resize_height`, `resize_width`)
image_resize_method = get_image_resize_method(resize_method, batch_position)
distorted_image = tf.image.resize_images(image,
[resize_height, resize_width],
image_resize_method,
align_corners=False)
# Do a central crop of the image to size (height, width).
# MLPerf requires us to log (height, width) with two different keys.
mlperf.logger.log(key=mlperf.tags.INPUT_CENTRAL_CROP, value=[height, width])
mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width])
total_crop_height = (resize_height - height)
crop_top = total_crop_height // 2
total_crop_width = (resize_width - width)
crop_left = total_crop_width // 2
distorted_image = tf.slice(distorted_image, [crop_top, crop_left, 0],
[height, width, 3])
distorted_image.set_shape([height, width, 3])
if summary_verbosity >= 3:
tf.summary.image(
'cropped_resized_image', tf.expand_dims(distorted_image, 0))
image = distorted_image
return image
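# Worked example for eval_image (illustrative only): for a 480x640 image and
# height=width=224, max_ratio = 224/480, so the image is resized to about
# 256x341 (aspect ratio preserved, both sides at least 224 * 1.145), then
# centrally cropped with crop_top = (256 - 224) // 2 = 16 and
# crop_left = (341 - 224) // 2 = 58.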
def train_image(image_buffer,
height,
width,
bbox,
batch_position,
resize_method,
distortions,
scope=None,
summary_verbosity=0,
distort_color_in_yiq=False,
fuse_decode_and_crop=False):
"""Distort one image for training a network.
Distorting images provides a useful technique for augmenting the data
set during training in order to make the network invariant to aspects
  of the image that do not affect the label.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
height: integer
width: integer
bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
where each coordinate is [0, 1) and the coordinates are arranged
as [ymin, xmin, ymax, xmax].
batch_position: position of the image in a batch, which affects how images
are distorted and resized. NOTE: this argument can be an integer or a
tensor
resize_method: round_robin, nearest, bilinear, bicubic, or area.
distortions: If true, apply full distortions for image colors.
scope: Optional scope for op_scope.
summary_verbosity: Verbosity level for summary ops. Pass 0 to disable both
summaries and checkpoints.
distort_color_in_yiq: distort color of input images in YIQ space.
fuse_decode_and_crop: fuse the decode/crop operation.
Returns:
3-D float Tensor of distorted image used for training.
"""
# with tf.op_scope([image, height, width, bbox], scope, 'distort_image'):
# with tf.name_scope(scope, 'distort_image', [image, height, width, bbox]):
with tf.name_scope(scope or 'distort_image'):
# A large fraction of image datasets contain a human-annotated bounding box
# delineating the region of the image containing the object of interest. We
# choose to create a new bounding box for the object which is a randomly
# distorted version of the human-annotated bounding box that obeys an
# allowed range of aspect ratios, sizes and overlap with the human-annotated
# bounding box. If no box is supplied, then we assume the bounding box is
# the entire image.
min_object_covered = 0.1
aspect_ratio_range = [0.75, 1.33]
area_range = [0.05, 1.0]
max_attempts = 100
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MIN_OBJ_COV,
value=min_object_covered)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_RATIO_RANGE,
value=aspect_ratio_range)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_AREA_RANGE,
value=area_range)
mlperf.logger.log(key=mlperf.tags.INPUT_DISTORTED_CROP_MAX_ATTEMPTS,
value=max_attempts)
sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
tf.image.extract_jpeg_shape(image_buffer),
bounding_boxes=bbox,
min_object_covered=min_object_covered,
aspect_ratio_range=aspect_ratio_range,
area_range=area_range,
max_attempts=max_attempts,
use_image_if_no_bounding_boxes=True)
bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
if summary_verbosity >= 3:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
image_with_distorted_box = tf.image.draw_bounding_boxes(
tf.expand_dims(image, 0), distort_bbox)
tf.summary.image(
'images_with_distorted_bounding_box',
image_with_distorted_box)
# Crop the image to the specified bounding box.
if fuse_decode_and_crop:
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
image = tf.image.decode_and_crop_jpeg(
image_buffer, crop_window, channels=3)
else:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
image = tf.slice(image, bbox_begin, bbox_size)
mlperf.logger.log(key=mlperf.tags.INPUT_RANDOM_FLIP)
distorted_image = tf.image.random_flip_left_right(image)
# This resizing operation may distort the images because the aspect
# ratio is not respected.
mlperf.logger.log(key=mlperf.tags.INPUT_RESIZE, value=[height, width])
image_resize_method = get_image_resize_method(resize_method, batch_position)
distorted_image = tf.image.resize_images(
distorted_image, [height, width],
image_resize_method,
align_corners=False)
# Restore the shape since the dynamic slice based upon the bbox_size loses
# the third dimension.
distorted_image.set_shape([height, width, 3])
if summary_verbosity >= 3:
tf.summary.image('cropped_resized_maybe_flipped_image',
tf.expand_dims(distorted_image, 0))
if distortions:
distorted_image = tf.cast(distorted_image, dtype=tf.float32)
# Images values are expected to be in [0,1] for color distortion.
distorted_image /= 255.
# Randomly distort the colors.
distorted_image = distort_color(distorted_image, batch_position,
distort_color_in_yiq=distort_color_in_yiq)
# Note: This ensures the scaling matches the output of eval_image
distorted_image *= 255
if summary_verbosity >= 3:
tf.summary.image(
'final_distorted_image',
tf.expand_dims(distorted_image, 0))
return distorted_image
def distort_color(image, batch_position=0, distort_color_in_yiq=False,
scope=None):
"""Distort the color of the image.
Each color distortion is non-commutative and thus ordering of the color ops
matters. Ideally we would randomly permute the ordering of the color ops.
  Rather than adding that level of complication, we select a distinct ordering
of color ops based on the position of the image in a batch.
Args:
image: float32 Tensor containing single image. Tensor values should be in
range [0, 1].
batch_position: the position of the image in a batch. NOTE: this argument
can be an integer or a tensor
distort_color_in_yiq: distort color of input images in YIQ space.
scope: Optional scope for op_scope.
Returns:
color-distorted image
"""
if distort_color_in_yiq:
try:
from tensorflow.contrib.image.python.ops import distort_image_ops # pylint: disable=g-import-not-at-top
except ImportError:
raise ValueError(
'In TF2, you cannot pass --distortions unless you also pass '
'--nodistort_color_in_yiq. This is because the random_hsv_in_yiq was '
'removed in TF2. --distortions does not improve accuracy on resnet '
'so it is not recommended. --nodistort_color_in_yiq also has no '
'impact on accuracy, but may hurt performance.')
with tf.name_scope(scope or 'distort_color'):
def distort_fn_0(image=image):
"""Variant 0 of distort function."""
image = tf.image.random_brightness(image, max_delta=32. / 255.)
if distort_color_in_yiq:
image = distort_image_ops.random_hsv_in_yiq(
image, lower_saturation=0.5, upper_saturation=1.5,
max_delta_hue=0.2 * math.pi)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
return image
def distort_fn_1(image=image):
"""Variant 1 of distort function."""
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
if distort_color_in_yiq:
image = distort_image_ops.random_hsv_in_yiq(
image, lower_saturation=0.5, upper_saturation=1.5,
max_delta_hue=0.2 * math.pi)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
return image
image = utils.smart_cond(batch_position % 2 == 0, distort_fn_0,
distort_fn_1)
# The random_* ops do not necessarily clamp.
image = tf.clip_by_value(image, 0.0, 1.0)
return image
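
# A minimal usage sketch for distort_color (illustrative only; the input must
# already be a float32 image scaled to [0, 1], per the docstring above):
def _example_distort_color(image):
  """Distorts even- and odd-positioned images with different op orderings."""
  even = distort_color(image, batch_position=0, distort_color_in_yiq=False)
  odd = distort_color(image, batch_position=1, distort_color_in_yiq=False)
  return even, odd  # Both are clipped back to [0, 1].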
class InputPreprocessor(object):
"""Base class for all model preprocessors."""
def __init__(self, batch_size, output_shapes):
self.batch_size = batch_size
self.output_shapes = output_shapes
def supports_datasets(self):
"""Whether this preprocessor supports dataset."""
return False
def minibatch(self, dataset, subset, params, shift_ratio=-1):
"""Returns tensors representing a minibatch of all the input."""
raise NotImplementedError('Must be implemented by subclass.')
# The methods added below are only supported/used if supports_datasets()
# returns True.
# TODO(laigd): refactor benchmark_cnn.py and put the logic of
# _build_input_processing() into InputPreprocessor.
def parse_and_preprocess(self, value, batch_position):
"""Function to parse and preprocess an Example proto in input pipeline."""
raise NotImplementedError('Must be implemented by subclass.')
# TODO(laigd): figure out how to remove these parameters, since the
# preprocessor itself has self.batch_size, self.num_splits, etc defined.
def build_multi_device_iterator(self, batch_size, num_splits, cpu_device,
params, gpu_devices, dataset, doing_eval):
"""Creates a MultiDeviceIterator."""
assert self.supports_datasets()
assert num_splits == len(gpu_devices)
with tf.name_scope('batch_processing'):
if doing_eval:
subset = 'validation'
else:
subset = 'train'
batch_size_per_split = batch_size // num_splits
ds = self.create_dataset(
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train=(not doing_eval),
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
ds,
gpu_devices,
source_device=cpu_device,
max_buffer_size=params.multi_device_iterator_max_buffer_size)
tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
multi_device_iterator.initializer)
return multi_device_iterator
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
raise NotImplementedError('Must be implemented by subclass.')
def create_iterator(self, ds):
ds_iterator = tf.data.make_initializable_iterator(ds)
tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
ds_iterator.initializer)
return ds_iterator
def minibatch_fn(self, batch_size, model_input_shapes, num_splits,
dataset, subset, train, datasets_repeat_cached_sample,
num_threads, datasets_use_caching,
datasets_parallel_interleave_cycle_length,
datasets_sloppy_parallel_interleave,
datasets_parallel_interleave_prefetch):
"""Returns a function and list of args for the fn to create a minibatch."""
assert self.supports_datasets()
batch_size_per_split = batch_size // num_splits
assert batch_size_per_split == model_input_shapes[0][0]
with tf.name_scope('batch_processing'):
ds = self.create_dataset(batch_size, num_splits, batch_size_per_split,
dataset, subset, train,
datasets_repeat_cached_sample, num_threads,
datasets_use_caching,
datasets_parallel_interleave_cycle_length,
datasets_sloppy_parallel_interleave,
datasets_parallel_interleave_prefetch)
ds_iterator = self.create_iterator(ds)
ds_iterator_string_handle = ds_iterator.string_handle()
@function.Defun(tf.string)
def _fn(h):
remote_iterator = tf.data.Iterator.from_string_handle(
h, ds_iterator.output_types, ds_iterator.output_shapes)
input_list = remote_iterator.get_next()
reshaped_input_list = [
tf.reshape(input_list[i], shape=model_input_shapes[i])
for i in range(len(input_list))
]
return reshaped_input_list
return _fn, [ds_iterator_string_handle]
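
# The contract for subclasses, in a minimal hypothetical form (the real
# preprocessors below implement create_dataset and parse_and_preprocess over
# actual Example protos; this toy subclass just passes records through):
class _ExamplePassthroughPreprocessor(InputPreprocessor):
  """Illustration only: yields raw serialized records unchanged."""

  def supports_datasets(self):
    return True

  def parse_and_preprocess(self, value, batch_position):
    del batch_position  # This sketch ignores the position in the batch.
    return value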
class BaseImagePreprocessor(InputPreprocessor):
"""Base class for all image model preprocessors."""
def __init__(self,
batch_size,
output_shapes,
num_splits,
dtype,
train,
distortions,
resize_method,
shift_ratio=-1,
summary_verbosity=0,
distort_color_in_yiq=True,
fuse_decode_and_crop=True,
match_mlperf=False):
super(BaseImagePreprocessor, self).__init__(batch_size, output_shapes)
image_shape = output_shapes[0]
# image_shape is in form (batch_size, height, width, depth)
self.height = image_shape[1]
self.width = image_shape[2]
self.depth = image_shape[3]
self.num_splits = num_splits
self.dtype = dtype
self.train = train
self.resize_method = resize_method
self.shift_ratio = shift_ratio
self.distortions = distortions
self.distort_color_in_yiq = distort_color_in_yiq
self.fuse_decode_and_crop = fuse_decode_and_crop
if self.batch_size % self.num_splits != 0:
raise ValueError(
('batch_size must be a multiple of num_splits: '
'batch_size %d, num_splits: %d') %
(self.batch_size, self.num_splits))
self.batch_size_per_split = self.batch_size // self.num_splits
self.summary_verbosity = summary_verbosity
self.match_mlperf = match_mlperf
def parse_and_preprocess(self, value, batch_position):
assert self.supports_datasets()
image_buffer, label_index, bbox, _ = parse_example_proto(value)
if self.match_mlperf:
bbox = tf.zeros((1, 0, 4), dtype=bbox.dtype)
mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=False)
else:
mlperf.logger.log(key=mlperf.tags.INPUT_CROP_USES_BBOXES, value=True)
image = self.preprocess(image_buffer, bbox, batch_position)
return (image, label_index)
def preprocess(self, image_buffer, bbox, batch_position):
raise NotImplementedError('Must be implemented by subclass.')
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
file_names = gfile.Glob(glob_pattern)
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave,
prefetch_input_elements=datasets_parallel_interleave_prefetch))
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
counter = tf.data.Dataset.range(batch_size)
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
buffer_size = 10000
mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=buffer_size)
ds = ds.apply(
tf.data.experimental.shuffle_and_repeat(buffer_size=buffer_size))
else:
ds = ds.repeat()
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=self.parse_and_preprocess,
batch_size=batch_size_per_split,
num_parallel_batches=num_splits))
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = num_threads
ds = ds.with_options(options)
return ds
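
# The dataset built above is: list files -> parallel-interleaved TFRecord
# reads -> optional cache -> shuffle_and_repeat (training) -> fused
# map_and_batch -> prefetch. A stripped-down sketch of the same shape
# (file pattern, parse function, and sizes are illustrative):
def _example_tfrecord_pipeline(file_pattern, parse_fn, batch_size=32):
  """Builds a minimal train-style TFRecord input pipeline."""
  ds = tf.data.Dataset.list_files(file_pattern, shuffle=True)
  ds = ds.apply(tf.data.experimental.parallel_interleave(
      tf.data.TFRecordDataset, cycle_length=10))
  ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
  ds = ds.apply(tf.data.experimental.map_and_batch(
      map_func=parse_fn, batch_size=batch_size))
  return ds.prefetch(buffer_size=1)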
class RecordInputImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor for images with RecordInput format."""
def preprocess(self, image_buffer, bbox, batch_position):
"""Preprocessing image_buffer as a function of its batch position."""
if self.train:
image = train_image(image_buffer, self.height, self.width, bbox,
batch_position, self.resize_method, self.distortions,
None, summary_verbosity=self.summary_verbosity,
distort_color_in_yiq=self.distort_color_in_yiq,
fuse_decode_and_crop=self.fuse_decode_and_crop)
else:
image = tf.image.decode_jpeg(
image_buffer, channels=3, dct_method='INTEGER_FAST')
image = eval_image(image, self.height, self.width, batch_position,
self.resize_method,
summary_verbosity=self.summary_verbosity)
# Note: image is now float32 [height,width,3] with range [0, 255]
# image = tf.cast(image, tf.uint8) # HACK TESTING
if self.match_mlperf:
mlperf.logger.log(key=mlperf.tags.INPUT_MEAN_SUBTRACTION,
value=_CHANNEL_MEANS)
normalized = image - _CHANNEL_MEANS
else:
normalized = normalized_image(image)
return tf.cast(normalized, self.dtype)
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
if shift_ratio < 0:
shift_ratio = self.shift_ratio
with tf.name_scope('batch_processing'):
# Build final results per split.
images = [[] for _ in range(self.num_splits)]
labels = [[] for _ in range(self.num_splits)]
if params.use_datasets:
ds = self.create_dataset(
self.batch_size, self.num_splits, self.batch_size_per_split,
dataset, subset, self.train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
for d in xrange(self.num_splits):
images[d], labels[d] = ds_iterator.get_next()
# TODO(laigd): consider removing the --use_datasets option, it should
# always use datasets.
else:
record_input = data_flow_ops.RecordInput(
file_pattern=dataset.tf_record_pattern(subset),
seed=301,
parallelism=64,
buffer_size=10000,
batch_size=self.batch_size,
shift_ratio=shift_ratio,
name='record_input')
records = record_input.get_yield_op()
records = tf.split(records, self.batch_size, 0)
records = [tf.reshape(record, []) for record in records]
for idx in xrange(self.batch_size):
value = records[idx]
(image, label) = self.parse_and_preprocess(value, idx)
split_index = idx % self.num_splits
labels[split_index].append(label)
images[split_index].append(image)
for split_index in xrange(self.num_splits):
if not params.use_datasets:
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.concat(labels[split_index], 0)
images[split_index] = tf.reshape(
images[split_index],
shape=[self.batch_size_per_split, self.height, self.width,
self.depth])
labels[split_index] = tf.reshape(labels[split_index],
[self.batch_size_per_split])
return images, labels
def supports_datasets(self):
return True
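
# A hypothetical instantiation of the preprocessor above (shapes, dtype, and
# resize method are illustrative; output_shapes[0] must have the form
# (batch, height, width, depth) and batch_size must divide by num_splits):
def _example_record_input_preprocessor():
  """Builds a RecordInputImagePreprocessor for 224x224 RGB training."""
  return RecordInputImagePreprocessor(
      batch_size=64,
      output_shapes=[(64, 224, 224, 3), (64,)],
      num_splits=1,
      dtype=tf.float32,
      train=True,
      distortions=False,
      resize_method='bilinear')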
class ImagenetPreprocessor(RecordInputImagePreprocessor):
def preprocess(self, image_buffer, bbox, batch_position):
# pylint: disable=g-import-not-at-top
try:
from official.r1.resnet.imagenet_preprocessing import preprocess_image
except ImportError:
tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.')
raise
if self.train:
image = preprocess_image(
image_buffer, bbox, self.height, self.width, self.depth,
is_training=True)
else:
image = preprocess_image(
image_buffer, bbox, self.height, self.width, self.depth,
is_training=False)
return tf.cast(image, self.dtype)
class Cifar10ImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor for Cifar10 input images."""
def _distort_image(self, image):
"""Distort one image for training a network.
    We adopt the standard data augmentation scheme that is widely used for
    this dataset: images are first zero-padded with 4 pixels on each side,
    then randomly cropped back to their original size; half of the images
    are then horizontally mirrored.
Args:
image: input image.
Returns:
distorted image.
"""
image = tf.image.resize_image_with_crop_or_pad(
image, self.height + 8, self.width + 8)
distorted_image = tf.random_crop(image,
[self.height, self.width, self.depth])
# Randomly flip the image horizontally.
distorted_image = tf.image.random_flip_left_right(distorted_image)
if self.summary_verbosity >= 3:
tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0))
return distorted_image
def _eval_image(self, image):
"""Get the image for model evaluation."""
    distorted_image = tf.image.resize_image_with_crop_or_pad(
        image, self.height, self.width)  # (target_height, target_width)
if self.summary_verbosity >= 3:
tf.summary.image('cropped.image', tf.expand_dims(distorted_image, 0))
return distorted_image
def preprocess(self, raw_image):
"""Preprocessing raw image."""
if self.summary_verbosity >= 3:
tf.summary.image('raw.image', tf.expand_dims(raw_image, 0))
if self.train and self.distortions:
image = self._distort_image(raw_image)
else:
image = self._eval_image(raw_image)
normalized = normalized_image(image)
return tf.cast(normalized, self.dtype)
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
# TODO(jsimsa): Implement datasets code path
del shift_ratio, params
with tf.name_scope('batch_processing'):
all_images, all_labels = dataset.read_data_files(subset)
all_images = tf.constant(all_images)
all_labels = tf.constant(all_labels)
input_image, input_label = tf.train.slice_input_producer(
[all_images, all_labels])
input_image = tf.cast(input_image, self.dtype)
input_label = tf.cast(input_label, tf.int32)
# Ensure that the random shuffling has good mixing properties.
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(dataset.num_examples_per_epoch(subset) *
min_fraction_of_examples_in_queue)
raw_images, raw_labels = tf.train.shuffle_batch(
[input_image, input_label], batch_size=self.batch_size,
capacity=min_queue_examples + 3 * self.batch_size,
min_after_dequeue=min_queue_examples)
images = [[] for i in range(self.num_splits)]
labels = [[] for i in range(self.num_splits)]
# Create a list of size batch_size, each containing one image of the
# batch. Without the unstack call, raw_images[i] would still access the
# same image via a strided_slice op, but would be slower.
raw_images = tf.unstack(raw_images, axis=0)
raw_labels = tf.unstack(raw_labels, axis=0)
for i in xrange(self.batch_size):
split_index = i % self.num_splits
# The raw image read from data has the format [depth, height, width]
# reshape to the format returned by minibatch.
raw_image = tf.reshape(raw_images[i],
[dataset.depth, dataset.height, dataset.width])
raw_image = tf.transpose(raw_image, [1, 2, 0])
image = self.preprocess(raw_image)
images[split_index].append(image)
labels[split_index].append(raw_labels[i])
for split_index in xrange(self.num_splits):
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.parallel_stack(labels[split_index])
return images, labels
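
# The pad-then-crop augmentation used by _distort_image above, as a
# standalone sketch (assumes a [32, 32, 3] CIFAR image; padding 4 pixels per
# side gives a 40x40 canvas before the random crop):
def _example_cifar_augment(image, height=32, width=32, depth=3):
  """Zero-pads by 4 pixels per side, random-crops back, randomly flips."""
  padded = tf.image.resize_image_with_crop_or_pad(
      image, height + 8, width + 8)
  cropped = tf.random_crop(padded, [height, width, depth])
  return tf.image.random_flip_left_right(cropped)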
class COCOPreprocessor(BaseImagePreprocessor):
"""Preprocessor for COCO dataset input images, boxes, and labels."""
def minibatch(self,
dataset,
subset,
params,
shift_ratio=-1):
del shift_ratio # Not used when using datasets instead of data_flow_ops
with tf.name_scope('batch_processing'):
ds = self.create_dataset(
batch_size=self.batch_size,
num_splits=self.num_splits,
batch_size_per_split=self.batch_size_per_split,
dataset=dataset,
subset=subset,
train=self.train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
# Training data: 4 tuple
# Validation data: 5 tuple
# See get_input_shapes in models/ssd_model.py for details.
input_len = 4 if subset == 'train' else 5
input_lists = [[None for _ in range(self.num_splits)]
for _ in range(input_len)]
for d in xrange(self.num_splits):
input_list = ds_iterator.get_next()
for i in range(input_len):
input_lists[i][d] = input_list[i]
return input_lists
def preprocess(self, data):
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
import ssd_constants # pylint: disable=g-import-not-at-top
from object_detection.core import preprocessor # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation')
image_buffer = data['image_buffer']
boxes = data['groundtruth_boxes']
classes = tf.reshape(data['groundtruth_classes'], [-1, 1])
source_id = tf.string_to_number(data['source_id'])
raw_shape = data['raw_shape']
ssd_encoder = ssd_dataloader.Encoder()
# Only 80 of the 90 COCO classes are used.
class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
classes = tf.gather(class_map, classes)
classes = tf.cast(classes, dtype=tf.float32)
if self.train:
image, boxes, classes = ssd_dataloader.ssd_decode_and_crop(
image_buffer, boxes, classes, raw_shape)
      # ssd_decode_and_crop resizes and returns an image of dtype float32
      # without changing its range (i.e., values stay within 0--255).
      # Dividing by 255 converts it to the [0, 1] range. This is not done
      # before cropping, to avoid a dtype cast (which would incur an
      # additional memory copy).
image /= 255.
image, boxes = preprocessor.random_horizontal_flip(
image=image, boxes=boxes)
# Random horizontal flip probability is 50%
# See https://github.com/tensorflow/models/blob/master/research/object_detection/core/preprocessor.py # pylint: disable=line-too-long
mlperf.logger.log(key=mlperf.tags.RANDOM_FLIP_PROBABILITY, value=0.5)
image = tf.cast(image, self.dtype)
encoded_returns = ssd_encoder.encode_labels(boxes, classes)
encoded_classes, encoded_boxes, num_matched_boxes = encoded_returns
# Shape of image: [width, height, channel]
# Shape of encoded_boxes: [NUM_SSD_BOXES, 4]
# Shape of encoded_classes: [NUM_SSD_BOXES, 1]
# Shape of num_matched_boxes: [1]
return (image, encoded_boxes, encoded_classes, num_matched_boxes)
else:
image = tf.image.decode_jpeg(image_buffer)
image = tf.image.resize_images(
image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
      # resize_images returns an image of dtype float32 and does not change
      # its range. Divide by 255 to convert the image to the [0, 1] range.
image /= 255.
image = ssd_dataloader.normalize_image(image)
image = tf.cast(image, self.dtype)
def trim_and_pad(inp_tensor):
"""Limit the number of boxes, and pad if necessary."""
inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
return tf.reshape(inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES,
inp_tensor.get_shape()[1]])
boxes, classes = trim_and_pad(boxes), trim_and_pad(classes)
# Shape of boxes: [MAX_NUM_EVAL_BOXES, 4]
# Shape of classes: [MAX_NUM_EVAL_BOXES, 1]
# Shape of source_id: [] (scalar tensor)
# Shape of raw_shape: [3]
return (image, boxes, classes, source_id, raw_shape)
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
try:
import ssd_dataloader # pylint: disable=g-import-not-at-top
except ImportError:
raise ImportError('To use the COCO dataset, you must clone the '
'repo https://github.com/tensorflow/models and add '
'tensorflow/models and tensorflow/models/research to '
'the PYTHONPATH, and compile the protobufs by '
'following https://github.com/tensorflow/models/blob/'
'master/research/object_detection/g3doc/installation.md'
'#protobuf-compilation')
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
ds = tf.data.TFRecordDataset.list_files(glob_pattern, shuffle=train)
# TODO(haoyuzhang): Enable map+filter fusion after cl/218399112 in release
# options = tf.data.Options()
# options.experimental_optimization = tf.data.experimental.OptimizationOptions() # pylint: disable=line-too-long
# options.experimental_optimization.map_and_filter_fusion = True
# ds = ds.with_options(options)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave))
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
mlperf.logger.log(key=mlperf.tags.INPUT_SHARD, value=10000)
mlperf.logger.log(key=mlperf.tags.INPUT_ORDER)
else:
ds = ds.repeat()
ds = ds.map(ssd_dataloader.ssd_parse_example_proto, num_parallel_calls=64)
ds = ds.filter(
lambda data: tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0))
ds = ds.apply(
tf.data.experimental.map_and_batch(
map_func=self.preprocess,
batch_size=batch_size_per_split,
num_parallel_batches=num_splits,
drop_remainder=train))
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = num_threads
ds = ds.with_options(options)
return ds
def supports_datasets(self):
return True
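
# The trim_and_pad helper inside preprocess above forces a variable-length
# [N, k] tensor to a static shape. A standalone sketch with an illustrative
# maximum of 200 boxes:
def _example_trim_and_pad(boxes, max_boxes=200):
  """Clips [N, 4] boxes to max_boxes rows and zero-pads shorter inputs."""
  boxes = boxes[:max_boxes]
  num_pad = max_boxes - tf.shape(boxes)[0]
  boxes = tf.pad(boxes, [[0, num_pad], [0, 0]])
  return tf.reshape(boxes, [max_boxes, 4])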
class TestImagePreprocessor(BaseImagePreprocessor):
"""Preprocessor used for testing.
set_fake_data() sets which images and labels will be output by minibatch(),
and must be called before minibatch(). This allows tests to easily specify
a set of images to use for training, without having to create any files.
Queue runners must be started for this preprocessor to work.
"""
def __init__(self,
batch_size,
output_shapes,
num_splits,
dtype,
train=None,
distortions=None,
resize_method=None,
shift_ratio=0,
summary_verbosity=0,
distort_color_in_yiq=False,
fuse_decode_and_crop=False,
match_mlperf=False):
super(TestImagePreprocessor, self).__init__(
batch_size, output_shapes, num_splits, dtype, train, distortions,
resize_method, shift_ratio, summary_verbosity=summary_verbosity,
distort_color_in_yiq=distort_color_in_yiq,
fuse_decode_and_crop=fuse_decode_and_crop, match_mlperf=match_mlperf)
self.expected_subset = None
def set_fake_data(self, fake_images, fake_labels):
assert len(fake_images.shape) == 4
assert len(fake_labels.shape) == 1
num_images = fake_images.shape[0]
assert num_images == fake_labels.shape[0]
assert num_images % self.batch_size == 0
self.fake_images = fake_images
self.fake_labels = fake_labels
def minibatch(self,
dataset,
subset,
params,
shift_ratio=0):
"""Get test image batches."""
del dataset, params
if (not hasattr(self, 'fake_images') or
not hasattr(self, 'fake_labels')):
raise ValueError('Must call set_fake_data() before calling minibatch '
'on TestImagePreprocessor')
if self.expected_subset is not None:
assert subset == self.expected_subset
shift_ratio = shift_ratio or self.shift_ratio
fake_images = cnn_util.roll_numpy_batches(self.fake_images, self.batch_size,
shift_ratio)
fake_labels = cnn_util.roll_numpy_batches(self.fake_labels, self.batch_size,
shift_ratio)
with tf.name_scope('batch_processing'):
image_slice, label_slice = tf.train.slice_input_producer(
[fake_images, fake_labels],
shuffle=False,
name='image_slice')
raw_images, raw_labels = tf.train.batch(
[image_slice, label_slice], batch_size=self.batch_size,
name='image_batch')
images = [[] for _ in range(self.num_splits)]
labels = [[] for _ in range(self.num_splits)]
for i in xrange(self.batch_size):
split_index = i % self.num_splits
raw_image = tf.cast(raw_images[i], self.dtype)
images[split_index].append(raw_image)
labels[split_index].append(raw_labels[i])
for split_index in xrange(self.num_splits):
images[split_index] = tf.parallel_stack(images[split_index])
labels[split_index] = tf.parallel_stack(labels[split_index])
normalized = [normalized_image(part) for part in images]
return [[tf.cast(part, self.dtype) for part in normalized], labels]
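
# A hypothetical test setup for the preprocessor above (array shapes are
# illustrative; the number of fake images must be a multiple of batch_size):
def _example_test_preprocessor():
  """Wires 8 fake 32x32 RGB images into a TestImagePreprocessor."""
  import numpy as np  # Local import keeps this sketch self-contained.
  preprocessor = TestImagePreprocessor(
      batch_size=4, output_shapes=[(4, 32, 32, 3), (4,)],
      num_splits=1, dtype=tf.float32)
  preprocessor.set_fake_data(
      np.zeros((8, 32, 32, 3), dtype=np.float32),
      np.arange(8, dtype=np.int32))
  return preprocessor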
class LibrispeechPreprocessor(InputPreprocessor):
"""Preprocessor for librispeech class for all image model preprocessors."""
def __init__(self, batch_size, output_shapes, num_splits, dtype, train,
**kwargs):
del kwargs
super(LibrispeechPreprocessor, self).__init__(batch_size, output_shapes)
self.num_splits = num_splits
self.dtype = dtype
self.is_train = train
if self.batch_size % self.num_splits != 0:
raise ValueError(('batch_size must be a multiple of num_splits: '
'batch_size %d, num_splits: %d') % (self.batch_size,
self.num_splits))
self.batch_size_per_split = self.batch_size // self.num_splits
def create_dataset(self,
batch_size,
num_splits,
batch_size_per_split,
dataset,
subset,
train,
datasets_repeat_cached_sample,
num_threads=None,
datasets_use_caching=False,
datasets_parallel_interleave_cycle_length=None,
datasets_sloppy_parallel_interleave=False,
datasets_parallel_interleave_prefetch=None):
"""Creates a dataset for the benchmark."""
    # TODO(laigd): currently the only difference between this and the one in
    # BaseImagePreprocessor is that this uses map() and padded_batch() while
    # the latter uses tf.data.experimental.map_and_batch(). Try to merge them.
assert self.supports_datasets()
glob_pattern = dataset.tf_record_pattern(subset)
file_names = gfile.Glob(glob_pattern)
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=train)
ds = ds.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=datasets_parallel_interleave_cycle_length or 10,
sloppy=datasets_sloppy_parallel_interleave,
prefetch_input_elements=datasets_parallel_interleave_prefetch))
if datasets_repeat_cached_sample:
# Repeat a single sample element indefinitely to emulate memory-speed IO.
ds = ds.take(1).cache().repeat()
counter = tf.data.Dataset.range(batch_size)
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
if datasets_use_caching:
ds = ds.cache()
if train:
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
else:
ds = ds.repeat()
ds = ds.map(map_func=self.parse_and_preprocess,
num_parallel_calls=batch_size_per_split*num_splits)
ds = ds.padded_batch(
batch_size=batch_size_per_split,
padded_shapes=tuple([
tf.TensorShape(output_shape[1:])
for output_shape in self.output_shapes
]),
drop_remainder=True)
ds = ds.prefetch(buffer_size=num_splits)
if num_threads:
options = tf.data.Options()
options.experimental_threading.private_threadpool_size = num_threads
ds = ds.with_options(options)
return ds
def minibatch(self, dataset, subset, params, shift_ratio=-1):
assert params.use_datasets
# TODO(laigd): unify this with CNNModel's minibatch()
# TODO(laigd): in distributed mode we use shift_ratio so different workers
# won't work on same inputs, so we should respect that.
del shift_ratio
with tf.name_scope('batch_processing'):
ds = self.create_dataset(
self.batch_size,
self.num_splits,
self.batch_size_per_split,
dataset,
subset,
self.is_train,
datasets_repeat_cached_sample=params.datasets_repeat_cached_sample,
num_threads=params.datasets_num_private_threads,
datasets_use_caching=params.datasets_use_caching,
datasets_parallel_interleave_cycle_length=(
params.datasets_parallel_interleave_cycle_length),
datasets_sloppy_parallel_interleave=(
params.datasets_sloppy_parallel_interleave),
datasets_parallel_interleave_prefetch=(
params.datasets_parallel_interleave_prefetch))
ds_iterator = self.create_iterator(ds)
# The four lists are: input spectrogram feature, labels, input lengths,
# label lengths
input_lists = [[None for _ in range(self.num_splits)] for _ in range(4)]
for d in xrange(self.num_splits):
input_list = ds_iterator.get_next()
for i in range(4):
input_lists[i][d] = input_list[i]
assert self.output_shapes == [
input_lists[i][0].shape.as_list() for i in range(4)
]
return tuple(input_lists)
def supports_datasets(self):
return True
def parse_and_preprocess(self, value, batch_position):
"""Parse an TFRecord."""
del batch_position
assert self.supports_datasets()
context_features = {
'labels': tf.VarLenFeature(dtype=tf.int64),
'input_length': tf.FixedLenFeature([], dtype=tf.int64),
'label_length': tf.FixedLenFeature([], dtype=tf.int64),
}
sequence_features = {
'features': tf.FixedLenSequenceFeature([161], dtype=tf.float32)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=value,
context_features=context_features,
sequence_features=sequence_features,
)
return [
# Input
tf.expand_dims(sequence_parsed['features'], axis=2),
# Label
tf.cast(
tf.reshape(
tf.sparse_tensor_to_dense(context_parsed['labels']), [-1]),
dtype=tf.int32),
# Input length
tf.cast(
tf.reshape(context_parsed['input_length'], [1]),
dtype=tf.int32),
# Label length
tf.cast(
tf.reshape(context_parsed['label_length'], [1]),
dtype=tf.int32),
]
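
# A sketch of a SequenceExample matching the schema parsed above (values are
# illustrative; each 'features' frame is a 161-dim float vector, and this
# helper is not part of the original file):
def _example_librispeech_record(num_frames=5, num_labels=3):
  """Serializes one toy record in the schema parse_and_preprocess expects."""
  example = tf.train.SequenceExample()
  example.context.feature['labels'].int64_list.value.extend(
      range(num_labels))
  example.context.feature['input_length'].int64_list.value.append(num_frames)
  example.context.feature['label_length'].int64_list.value.append(num_labels)
  frames = example.feature_lists.feature_list['features']
  for _ in range(num_frames):
    frames.feature.add().float_list.value.extend([0.0] * 161)
  return example.SerializeToString()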
#!/bin/bash
source /public/home/qianyj/virtualenv/dtk21.10.1/dtk21.10.1_tf1.15/venv/bin/activate
export ROCM_PATH=/public/home/qianyj/package/dtk-21.10.1/dtk-21.10.1
export HIP_PATH=${ROCM_PATH}/hip
export CPACK_INSTALL_PREFIX=$ROCM_PATH
export AMDGPU_TARGETS="gfx900;gfx906"
export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hip/bin:$PATH
export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${ROCM_PATH}/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${ROCM_PATH}/hip/lib:${ROCM_PATH}/llvm/lib:$LD_LIBRARY_PATH
export C_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/llvm/include${C_INCLUDE_PATH:+:${C_INCLUDE_PATH}}
export CPLUS_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/llvm/include${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export TF_CPP_MIN_VLOG_LEVEL=2
HIP_VISIBLE_DEVICES=0,1,2,3 numactl --cpunodebind=0,1,2,3 --membind=0,1,2,3 \
  nohup python3 tf_cnn_benchmarks.py \
    --data_format=NCHW \
    --batch_size=128 \
    --model=resnet50 \
    --save_model_steps=20000 \
    --optimizer=momentum \
    --variable_update=replicated \
    --print_training_accuracy=true \
    --eval_during_training_every_n_epochs=1 \
    --nodistortions \
    --num_gpus=4 \
    --num_epochs=90 \
    --weight_decay=1e-4 \
    --data_dir=/public/software/apps/DeepLearning/Data/ImageNet-tensorflow/ \
    --use_fp16=False \
    --data_name=imagenet \
    --train_dir=/public/home/qianyj/TF_test/dtk21.10.1/tf1.15/benchmarks-master/scripts/checkpoint \
    >logfile 2>&1 &