ModelZoo / ResNet50_tensorflow

Commit 20c78a91, authored Mar 18, 2020 by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 301643231
Parent: b86ffb12
Showing 6 changed files with 427 additions and 79 deletions (+427, -79):

  official/vision/detection/executor/detection_executor.py                  +3    -4
  official/vision/detection/modeling/base_model.py                          +3    -9
  official/vision/detection/modeling/factory.py                             +3    -0
  official/vision/detection/modeling/losses.py                              +69   -66
  official/vision/detection/modeling/maskrcnn_model.py                      +342  -0
  official/vision/detection/utils/object_detection/visualization_utils.py   +7    -0
official/vision/detection/executor/detection_executor.py
@@ -80,12 +80,11 @@ class DetectionDistributedExecutor(executor.DistributedExecutor):
         all_losses = loss_fn(labels, outputs)
         losses = {}
         for k, v in all_losses.items():
-          v = tf.reduce_mean(v) / strategy.num_replicas_in_sync
-          losses[k] = v
-        loss = losses['total_loss']
+          losses[k] = tf.reduce_mean(v)
+        per_replica_loss = losses['total_loss'] / strategy.num_replicas_in_sync
         _update_state(labels, outputs)

-      grads = tape.gradient(loss, trainable_variables)
+      grads = tape.gradient(per_replica_loss, trainable_variables)
       optimizer.apply_gradients(zip(grads, trainable_variables))
       return losses
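The hunk above stops rescaling every reported loss by the replica count and instead divides only the value fed to the gradient tape, so the reported dictionary keeps plain per-replica means while the cross-replica gradient sum still equals the gradient of the global mean. A minimal, self-contained sketch of that pattern, with a toy variable and toy losses rather than the executor's real model:

import tensorflow.compat.v2 as tf

strategy = tf.distribute.MirroredStrategy()  # replica count depends on local devices

with strategy.scope():
  w = tf.Variable(2.0)
  optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)

def train_step(x):
  with tf.GradientTape() as tape:
    all_losses = {'total_loss': tf.reduce_sum(w * x), 'box_loss': tf.reduce_sum(x)}
    # Reported values stay as unscaled per-replica means ...
    losses = {k: tf.reduce_mean(v) for k, v in all_losses.items()}
    # ... but the loss used for backprop is pre-divided by the replica count,
    # so summing per-replica gradients yields the gradient of the global mean.
    per_replica_loss = losses['total_loss'] / strategy.num_replicas_in_sync
  grads = tape.gradient(per_replica_loss, [w])
  optimizer.apply_gradients(zip(grads, [w]))
  return losses

per_replica_losses = strategy.run(train_step, args=(tf.constant([1.0, 2.0]),))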
official/vision/detection/modeling/base_model.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 import abc
 import functools
 import re
-from absl import logging
-
 import tensorflow.compat.v2 as tf
 from official.vision.detection.modeling import checkpoint_utils
 from official.vision.detection.modeling import learning_rates

@@ -60,11 +58,10 @@ class OptimizerFactory(object):
 def _make_filter_trainable_variables_fn(frozen_variable_prefix):
-  """Creates a function for filtering trainable varialbes.
-  """
+  """Creates a function for filtering trainable varialbes."""

   def _filter_trainable_variables(variables):
-    """Filters trainable varialbes
+    """Filters trainable varialbes.

     Args:
       variables: a list of tf.Variable to be filtered.

@@ -141,8 +138,7 @@ class Model(object):
     return self._optimizer_fn(self._learning_rate)

   def make_filter_trainable_variables_fn(self):
-    """Creates a function for filtering trainable varialbes.
-    """
+    """Creates a function for filtering trainable varialbes."""
     return _make_filter_trainable_variables_fn(self._frozen_variable_prefix)

   def weight_decay_loss(self, trainable_variables):

@@ -151,8 +147,6 @@ class Model(object):
         if self._regularization_var_regex is None or
         re.match(self._regularization_var_regex, v.name)
     ]
-    logging.info('Regularization Variables: %s',
-                 [v.name for v in reg_variables])

     return self._l2_weight_decay * tf.add_n(
         [tf.nn.l2_loss(v) for v in reg_variables])
official/vision/detection/modeling/factory.py
@@ -15,6 +15,7 @@
 """Factory to build detection model."""

+from official.vision.detection.modeling import maskrcnn_model
 from official.vision.detection.modeling import retinanet_model

@@ -22,6 +23,8 @@ def model_generator(params):
   """Model function generator."""
   if params.type == 'retinanet':
     model_fn = retinanet_model.RetinanetModel(params)
+  elif params.type == 'mask_rcnn':
+    model_fn = maskrcnn_model.MaskrcnnModel(params)
   else:
     raise ValueError('Model %s is not supported.' % params.type)
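With the new branch, a caller whose params.type is 'mask_rcnn' gets the new model class back instead of a ValueError. A hedged usage sketch; the config helper below is assumed from the surrounding repository, not from this commit:

from official.vision.detection.configs import factory as config_factory
from official.vision.detection.modeling import factory as model_factory

params = config_factory.config_generator('mask_rcnn')  # assumed config entry point
model_fn = model_factory.model_generator(params)       # now a MaskrcnnModel instance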
official/vision/detection/modeling/losses.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from absl import logging
 import tensorflow.compat.v2 as tf

@@ -89,6 +90,8 @@ class RpnScoreLoss(object):
   def __init__(self, params):
     self._rpn_batch_size_per_im = params.rpn_batch_size_per_im
+    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
+        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)

   def __call__(self, score_outputs, labels):
     """Computes total RPN detection loss.

@@ -129,16 +132,15 @@ class RpnScoreLoss(object):
     with tf.name_scope('rpn_score_loss'):
       mask = tf.math.logical_or(tf.math.equal(score_targets, 1),
                                 tf.math.equal(score_targets, 0))
-      score_targets = tf.math.maximum(score_targets,
-                                      tf.zeros_like(score_targets))
-      # RPN score loss is sum over all except ignored samples.
-      # Keep the compat.v1 loss because Keras does not have a
-      # sigmoid_cross_entropy substitution yet.
-      # TODO(b/143720144): replace this loss.
-      score_loss = tf.compat.v1.losses.sigmoid_cross_entropy(
-          score_targets, score_outputs, weights=mask,
-          reduction=tf.compat.v1.losses.Reduction.SUM)
+      score_targets = tf.math.maximum(score_targets,
+                                      tf.zeros_like(score_targets))
+      score_targets = tf.expand_dims(score_targets, axis=-1)
+      score_outputs = tf.expand_dims(score_outputs, axis=-1)
+      score_loss = self._binary_crossentropy(
+          score_targets, score_outputs, sample_weight=mask)
       score_loss /= normalizer
       return score_loss
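The RPN score loss above swaps tf.compat.v1.losses.sigmoid_cross_entropy for tf.keras.losses.BinaryCrossentropy with a SUM reduction; the expand_dims on a trailing axis keeps the Keras loss from averaging over anchors before the mask is applied as a sample weight. A small sketch with made-up shapes, checked against a hand-rolled masked sigmoid cross entropy:

import numpy as np
import tensorflow.compat.v2 as tf

logits = tf.constant(np.random.randn(2, 6), dtype=tf.float32)            # score_outputs
targets = tf.constant(np.random.randint(0, 2, (2, 6)), dtype=tf.float32)  # score_targets
mask = tf.constant(np.random.randint(0, 2, (2, 6)), dtype=tf.float32)

# Keras path used in the diff: expand the last axis so the per-sample mean
# inside the Keras loss is a no-op, then weight by the mask and sum.
bce = tf.keras.losses.BinaryCrossentropy(
    reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
keras_loss = bce(tf.expand_dims(targets, axis=-1),
                 tf.expand_dims(logits, axis=-1), sample_weight=mask)

# Reference: element-wise sigmoid cross entropy, masked and summed, which is
# what the removed SUM-reduced compat.v1 loss computed.
manual_loss = tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits) * mask)

np.testing.assert_allclose(keras_loss.numpy(), manual_loss.numpy(), rtol=1e-5)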
@@ -147,7 +149,10 @@ class RpnBoxLoss(object):
   """Region Proposal Network box regression loss function."""

   def __init__(self, params):
-    self._delta = params.huber_loss_delta
+    logging.info('RpnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
+    # The delta is typically around the mean value of regression target.
+    # for instances, the regression targets of 512x512 input with 6 anchors on
+    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
+    self._huber_loss = tf.keras.losses.Huber(
+        delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)

@@ -171,35 +176,32 @@ class RpnBoxLoss(object):
       box_losses = []
       for level in levels:
-        box_losses.append(
-            self._rpn_box_loss(
-                box_outputs[level], labels[level], delta=self._delta))
+        box_losses.append(
+            self._rpn_box_loss(box_outputs[level], labels[level]))

       # Sum per level losses to total loss.
       return tf.add_n(box_losses)

-  def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0,
-                    delta=1./9):
+  def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0):
     """Computes box regression loss."""
-    # The delta is typically around the mean value of regression target.
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
     with tf.name_scope('rpn_box_loss'):
-      mask = tf.math.not_equal(box_targets, 0.0)
-      # The loss is normalized by the sum of non-zero weights before additional
-      # normalizer provided by the function caller.
-      box_loss = tf.compat.v1.losses.huber_loss(
-          box_targets, box_outputs, weights=mask, delta=delta,
-          reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-      box_loss /= normalizer
+      mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
+      box_targets = tf.expand_dims(box_targets, axis=-1)
+      box_outputs = tf.expand_dims(box_outputs, axis=-1)
+      box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
+      # The loss is normalized by the sum of non-zero weights and additional
+      # normalizer provided by the function caller. Using + 0.01 here to avoid
+      # division by zero.
+      box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
       return box_loss


 class FastrcnnClassLoss(object):
   """Fast R-CNN classification loss function."""

+  def __init__(self):
+    self._categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy(
+        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
+
   def __call__(self, class_outputs, class_targets):
     """Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
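The box losses move from tf.compat.v1.losses.huber_loss with SUM_BY_NONZERO_WEIGHTS to tf.keras.losses.Huber with a plain SUM, then divide by tf.reduce_sum(mask) + 0.01 to recover the same average over non-zero weights (the 0.01 guards against an all-background batch). A small sketch with made-up shapes checking that reading:

import numpy as np
import tensorflow.compat.v2 as tf

delta = 1. / 9
box_targets = tf.constant(np.random.randn(2, 8, 4), dtype=tf.float32)
# Zero out some rows to mimic background anchors that the mask should exclude.
box_targets *= tf.constant(np.random.randint(0, 2, (2, 8, 1)), dtype=tf.float32)
box_outputs = tf.constant(np.random.randn(2, 8, 4), dtype=tf.float32)
mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)

huber = tf.keras.losses.Huber(delta=delta, reduction=tf.keras.losses.Reduction.SUM)
keras_loss = huber(tf.expand_dims(box_targets, -1), tf.expand_dims(box_outputs, -1),
                   sample_weight=mask)
keras_loss /= tf.reduce_sum(mask) + 0.01

# Hand-rolled Huber, summed over the masked entries and averaged over the count
# of non-zero weights, which is what SUM_BY_NONZERO_WEIGHTS used to produce.
err = tf.abs(box_targets - box_outputs)
elementwise = tf.where(err <= delta, 0.5 * tf.square(err), delta * err - 0.5 * delta**2)
manual_loss = tf.reduce_sum(elementwise * mask) / tf.reduce_sum(mask)

# Only approximately equal because of the +0.01 guard in the denominator.
np.testing.assert_allclose(keras_loss.numpy(), manual_loss.numpy(), rtol=1e-2)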
@@ -218,24 +220,19 @@ class FastrcnnClassLoss(object):
       a scalar tensor representing total class loss.
     """
     with tf.name_scope('fast_rcnn_loss'):
-      _, _, num_classes = class_outputs.get_shape().as_list()
+      batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
       class_targets = tf.cast(class_targets, dtype=tf.int32)
       class_targets_one_hot = tf.one_hot(class_targets, num_classes)
-      return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot)
+      return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot,
+                                        normalizer=batch_size * num_boxes / 2.0)

-  def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
-                            normalizer=1.0):
+  def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
+                            normalizer):
     """Computes classification loss."""
     with tf.name_scope('fast_rcnn_class_loss'):
-      # The loss is normalized by the sum of non-zero weights before additional
-      # normalizer provided by the function caller.
-      # Keep the compat.v1 loss because Keras does not have a
-      # softmax_cross_entropy substitution yet.
-      # TODO(b/143720144): replace this loss.
-      class_loss = tf.compat.v1.losses.softmax_cross_entropy(
-          class_targets_one_hot, class_outputs,
-          reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+      class_loss = self._categorical_crossentropy(class_targets_one_hot,
+                                                  class_outputs)
       class_loss /= normalizer
       return class_loss
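Here tf.compat.v1.losses.softmax_cross_entropy gives way to the class's new tf.keras.losses.CategoricalCrossentropy with a SUM reduction, and the implicit averaging of SUM_BY_NONZERO_WEIGHTS is replaced by the explicit batch_size * num_boxes / 2.0 normalizer computed in __call__. A small sketch with made-up shapes showing that the Keras SUM matches a hand-rolled summed softmax cross entropy:

import numpy as np
import tensorflow.compat.v2 as tf

batch_size, num_boxes, num_classes = 2, 4, 3
logits = tf.constant(np.random.randn(batch_size, num_boxes, num_classes),
                     dtype=tf.float32)
class_targets = tf.constant(
    np.random.randint(0, num_classes, (batch_size, num_boxes)), dtype=tf.int32)
one_hot = tf.one_hot(class_targets, num_classes)

cce = tf.keras.losses.CategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
keras_sum = cce(one_hot, logits)

# Per-RoI softmax cross entropy, summed over the batch and the RoIs.
manual_sum = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits(labels=one_hot, logits=logits))
np.testing.assert_allclose(keras_sum.numpy(), manual_sum.numpy(), rtol=1e-5)

# The diff then applies the caller-supplied normalizer instead of an implicit mean.
class_loss = keras_sum / (batch_size * num_boxes / 2.0)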
@@ -244,7 +241,12 @@ class FastrcnnBoxLoss(object):
   """Fast R-CNN box regression loss function."""

   def __init__(self, params):
-    self._delta = params.huber_loss_delta
+    logging.info('FastrcnnBoxLoss huber_loss_delta %s', params.huber_loss_delta)
+    # The delta is typically around the mean value of regression target.
+    # for instances, the regression targets of 512x512 input with 6 anchors on
+    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
+    self._huber_loss = tf.keras.losses.Huber(
+        delta=params.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)

   def __call__(self, box_outputs, class_targets, box_targets):
     """Computes the box loss (Fast-RCNN branch) of Mask-RCNN.

@@ -296,36 +298,32 @@ class FastrcnnBoxLoss(object):
           dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
       box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
-      return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets,
-                                      delta=self._delta)
+      return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets)

-  def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
-                          normalizer=1.0, delta=1.):
+  def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
+                          normalizer=1.0):
     """Computes box regression loss."""
-    # The delta is typically around the mean value of regression target.
-    # for instances, the regression targets of 512x512 input with 6 anchors on
-    # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
     with tf.name_scope('fast_rcnn_box_loss'):
       mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
                      [1, 1, 4])
-      # The loss is normalized by the sum of non-zero weights before additional
-      # normalizer provided by the function caller.
-      # Keep the compat.v1 loss because Keras does not have a
-      # Reduction.SUM_BY_NONZERO_WEIGHTS substitution yet.
-      # TODO(b/143720144): replace this loss.
-      box_loss = tf.compat.v1.losses.huber_loss(
-          box_targets, box_outputs, weights=mask, delta=delta,
-          reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-      box_loss /= normalizer
+      mask = tf.cast(mask, dtype=tf.float32)
+      box_targets = tf.expand_dims(box_targets, axis=-1)
+      box_outputs = tf.expand_dims(box_outputs, axis=-1)
+      box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
+      # The loss is normalized by the number of ones in mask,
+      # additianal normalizer provided by the user and using 0.01 here to avoid
+      # division by 0.
+      box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
       return box_loss


 class MaskrcnnLoss(object):
   """Mask R-CNN instance segmentation mask loss function."""

+  def __init__(self):
+    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
+        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
+
   def __call__(self, mask_outputs, mask_targets, select_class_targets):
     """Computes the mask loss of Mask-RCNN.

@@ -358,11 +356,16 @@ class MaskrcnnLoss(object):
         tf.reshape(tf.greater(select_class_targets, 0),
                    [batch_size, num_masks, 1, 1]),
         [1, 1, mask_height, mask_width])
-    return tf.compat.v1.losses.sigmoid_cross_entropy(
-        mask_targets, mask_outputs, weights=weights,
-        reduction=tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+    weights = tf.cast(weights, dtype=tf.float32)
+    mask_targets = tf.expand_dims(mask_targets, axis=-1)
+    mask_outputs = tf.expand_dims(mask_outputs, axis=-1)
+    mask_loss = self._binary_crossentropy(mask_targets, mask_outputs,
+                                          sample_weight=weights)
+    # The loss is normalized by the number of 1's in weights and
+    # + 0.01 is used to avoid division by zero.
+    return mask_loss / (tf.reduce_sum(weights) + 0.01)


 class RetinanetClassLoss(object):
official/vision/detection/modeling/maskrcnn_model.py
new file (mode 100644)

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model defination for the Mask R-CNN Model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow.compat.v2 as tf

from tensorflow.python.keras import backend
from official.vision.detection.dataloader import anchor
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.evaluation import factory as eval_factory
from official.vision.detection.modeling import base_model
from official.vision.detection.modeling import losses
from official.vision.detection.modeling.architecture import factory
from official.vision.detection.ops import postprocess_ops
from official.vision.detection.ops import roi_ops
from official.vision.detection.ops import sampling_ops
from official.vision.detection.ops import spatial_transform_ops
from official.vision.detection.utils import box_utils


class MaskrcnnModel(base_model.Model):
  """Mask R-CNN model function."""

  def __init__(self, params):
    super(MaskrcnnModel, self).__init__(params)

    # For eval metrics.
    self._params = params
    self._keras_model = None
    self._include_mask = params.architecture.include_mask

    # Architecture generators.
    self._backbone_fn = factory.backbone_generator(params)
    self._fpn_fn = factory.multilevel_features_generator(params)
    self._rpn_head_fn = factory.rpn_head_generator(params.rpn_head)
    self._generate_rois_fn = roi_ops.ROIGenerator(params.roi_proposal)
    self._sample_rois_fn = sampling_ops.ROISampler(params.roi_sampling)
    self._sample_masks_fn = sampling_ops.MaskSampler(params.mask_sampling)
    self._frcnn_head_fn = factory.fast_rcnn_head_generator(params.frcnn_head)
    if self._include_mask:
      self._mrcnn_head_fn = factory.mask_rcnn_head_generator(params.mrcnn_head)

    # Loss function.
    self._rpn_score_loss_fn = losses.RpnScoreLoss(params.rpn_score_loss)
    self._rpn_box_loss_fn = losses.RpnBoxLoss(params.rpn_box_loss)
    self._frcnn_class_loss_fn = losses.FastrcnnClassLoss()
    self._frcnn_box_loss_fn = losses.FastrcnnBoxLoss(params.frcnn_box_loss)
    if self._include_mask:
      self._mask_loss_fn = losses.MaskrcnnLoss()

    self._generate_detections_fn = postprocess_ops.GenericDetectionGenerator(
        params.postprocess)

    self._transpose_input = params.train.transpose_input
    assert not self._transpose_input, 'Transpose input is not supportted.'

  def build_outputs(self, inputs, mode):
    is_training = mode == mode_keys.TRAIN
    model_outputs = {}

    image = inputs['image']
    _, image_height, image_width, _ = image.get_shape().as_list()
    backbone_features = self._backbone_fn(image, is_training)
    fpn_features = self._fpn_fn(backbone_features, is_training)

    rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
        fpn_features, is_training)
    model_outputs.update({
        'rpn_score_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  rpn_score_outputs),
        'rpn_box_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  rpn_box_outputs),
    })
    input_anchor = anchor.Anchor(self._params.anchor.min_level,
                                 self._params.anchor.max_level,
                                 self._params.anchor.num_scales,
                                 self._params.anchor.aspect_ratios,
                                 self._params.anchor.anchor_size,
                                 (image_height, image_width))
    rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
                                         input_anchor.multilevel_boxes,
                                         inputs['image_info'][:, 1, :],
                                         is_training)

    if is_training:
      rpn_rois = tf.stop_gradient(rpn_rois)

      # Sample proposals.
      rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
          self._sample_rois_fn(rpn_rois, inputs['gt_boxes'],
                               inputs['gt_classes']))

      # Create bounding box training targets.
      box_targets = box_utils.encode_boxes(
          matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
      # If the target is background, the box target is set to all 0s.
      box_targets = tf.where(
          tf.tile(
              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
              [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
      model_outputs.update({
          'class_targets': matched_gt_classes,
          'box_targets': box_targets,
      })

    roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_features, rpn_rois, output_size=7)

    class_outputs, box_outputs = self._frcnn_head_fn(roi_features, is_training)

    model_outputs.update({
        'class_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  class_outputs),
        'box_outputs':
            tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                  box_outputs),
    })

    # Add this output to train to make the checkpoint loadable in predict mode.
    # If we skip it in train mode, the heads will be out-of-order and checkpoint
    # loading will fail.
    boxes, scores, classes, valid_detections = self._generate_detections_fn(
        box_outputs, class_outputs, rpn_rois, inputs['image_info'][:, 1:2, :])

    model_outputs.update({
        'num_detections': valid_detections,
        'detection_boxes': boxes,
        'detection_classes': classes,
        'detection_scores': scores,
    })

    if not self._include_mask:
      return model_outputs

    if is_training:
      rpn_rois, classes, mask_targets = self._sample_masks_fn(
          rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
          inputs['gt_masks'])
      mask_targets = tf.stop_gradient(mask_targets)

      classes = tf.cast(classes, dtype=tf.int32)
      model_outputs.update({
          'mask_targets': mask_targets,
          'sampled_class_targets': classes,
      })
    else:
      rpn_rois = boxes
      classes = tf.cast(classes, dtype=tf.int32)

    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        fpn_features, rpn_rois, output_size=14)

    mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)

    if is_training:
      model_outputs.update({
          'mask_outputs':
              tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                    mask_outputs),
      })
    else:
      model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})

    return model_outputs

  def build_loss_fn(self):
    if self._keras_model is None:
      raise ValueError('build_loss_fn() must be called after build_model().')

    filter_fn = self.make_filter_trainable_variables_fn()
    trainable_variables = filter_fn(self._keras_model.trainable_variables)

    def _total_loss_fn(labels, outputs):
      rpn_score_loss = self._rpn_score_loss_fn(outputs['rpn_score_outputs'],
                                               labels['rpn_score_targets'])
      rpn_box_loss = self._rpn_box_loss_fn(outputs['rpn_box_outputs'],
                                           labels['rpn_box_targets'])

      frcnn_class_loss = self._frcnn_class_loss_fn(outputs['class_outputs'],
                                                   outputs['class_targets'])
      frcnn_box_loss = self._frcnn_box_loss_fn(outputs['box_outputs'],
                                               outputs['class_targets'],
                                               outputs['box_targets'])

      if self._include_mask:
        mask_loss = self._mask_loss_fn(outputs['mask_outputs'],
                                       outputs['mask_targets'],
                                       outputs['sampled_class_targets'])
      else:
        mask_loss = 0.0

      model_loss = (
          rpn_score_loss + rpn_box_loss + frcnn_class_loss + frcnn_box_loss +
          mask_loss)

      l2_regularization_loss = self.weight_decay_loss(trainable_variables)
      total_loss = model_loss + l2_regularization_loss
      return {
          'total_loss': total_loss,
          'loss': total_loss,
          'fast_rcnn_class_loss': frcnn_class_loss,
          'fast_rcnn_box_loss': frcnn_box_loss,
          'mask_loss': mask_loss,
          'model_loss': model_loss,
          'l2_regularization_loss': l2_regularization_loss,
          'rpn_score_loss': rpn_score_loss,
          'rpn_box_loss': rpn_box_loss,
      }

    return _total_loss_fn

  def build_input_layers(self, params, mode):
    is_training = mode == mode_keys.TRAIN
    input_shape = (
        params.maskrcnn_parser.output_size +
        [params.maskrcnn_parser.num_channels])
    if is_training:
      batch_size = params.train.batch_size
      input_layer = {
          'image':
              tf.keras.layers.Input(
                  shape=input_shape,
                  batch_size=batch_size,
                  name='image',
                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
          'image_info':
              tf.keras.layers.Input(
                  shape=[4, 2],
                  batch_size=batch_size,
                  name='image_info',
              ),
          'gt_boxes':
              tf.keras.layers.Input(
                  shape=[params.maskrcnn_parser.max_num_instances, 4],
                  batch_size=batch_size,
                  name='gt_boxes'),
          'gt_classes':
              tf.keras.layers.Input(
                  shape=[params.maskrcnn_parser.max_num_instances],
                  batch_size=batch_size,
                  name='gt_classes',
                  dtype=tf.int64),
      }
      if self._include_mask:
        input_layer['gt_masks'] = tf.keras.layers.Input(
            shape=[
                params.maskrcnn_parser.max_num_instances,
                params.maskrcnn_parser.mask_crop_size,
                params.maskrcnn_parser.mask_crop_size
            ],
            batch_size=batch_size,
            name='gt_masks')
    else:
      batch_size = params.eval.batch_size
      input_layer = {
          'image':
              tf.keras.layers.Input(
                  shape=input_shape,
                  batch_size=batch_size,
                  name='image',
                  dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
          'image_info':
              tf.keras.layers.Input(
                  shape=[4, 2],
                  batch_size=batch_size,
                  name='image_info',
              ),
      }
    return input_layer

  def build_model(self, params, mode):
    if self._keras_model is None:
      input_layers = self.build_input_layers(self._params, mode)
      with backend.get_graph().as_default():
        outputs = self.model_outputs(input_layers, mode)

        model = tf.keras.models.Model(
            inputs=input_layers, outputs=outputs, name='maskrcnn')
        assert model is not None, 'Fail to build tf.keras.Model.'
        model.optimizer = self.build_optimizer()
        self._keras_model = model

    return self._keras_model

  def post_processing(self, labels, outputs):
    required_output_fields = ['class_outputs', 'box_outputs']
    for field in required_output_fields:
      if field not in outputs:
        raise ValueError('"%s" is missing in outputs, requried %s found %s' %
                         (field, required_output_fields, outputs.keys()))
    predictions = {
        'image_info': labels['image_info'],
        'num_detections': outputs['num_detections'],
        'detection_boxes': outputs['detection_boxes'],
        'detection_classes': outputs['detection_classes'],
        'detection_scores': outputs['detection_scores'],
    }
    if self._include_mask:
      predictions.update({
          'detection_masks': outputs['detection_masks'],
      })

    if 'groundtruths' in labels:
      predictions['source_id'] = labels['groundtruths']['source_id']
      predictions['gt_source_id'] = labels['groundtruths']['source_id']
      predictions['gt_height'] = labels['groundtruths']['height']
      predictions['gt_width'] = labels['groundtruths']['width']
      predictions['gt_image_info'] = labels['image_info']
      predictions['gt_num_detections'] = (
          labels['groundtruths']['num_detections'])
      predictions['gt_boxes'] = labels['groundtruths']['boxes']
      predictions['gt_classes'] = labels['groundtruths']['classes']
      predictions['gt_areas'] = labels['groundtruths']['areas']
      predictions['gt_is_crowds'] = labels['groundtruths']['is_crowds']
    return labels, predictions

  def eval_metrics(self):
    return eval_factory.evaluator_generator(self._params.eval)
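Continuing the sketch from the factory section, the new class would be driven roughly as follows. The config helper and parameter layout are assumptions about the surrounding code base (in practice the distributed executor calls these methods), not something this commit shows:

from official.vision.detection.configs import factory as config_factory
from official.vision.detection.dataloader import mode_keys
from official.vision.detection.modeling import factory as model_factory

params = config_factory.config_generator('mask_rcnn')   # assumed config entry point
model_builder = model_factory.model_generator(params)   # MaskrcnnModel

keras_model = model_builder.build_model(params, mode_keys.TRAIN)  # builds the tf.keras.Model
loss_fn = model_builder.build_loss_fn()    # (labels, outputs) -> dict incl. 'total_loss'
evaluator = model_builder.eval_metrics()   # evaluator from eval_factory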
official/vision/detection/utils/object_detection/visualization_utils.py
@@ -21,6 +21,7 @@ The functions do not return a value, instead they modify the image itself.
 """

 import collections
 import functools
+from absl import logging

 # Set headless-friendly backend.
 import matplotlib; matplotlib.use('Agg')  # pylint: disable=multiple-statements
 import matplotlib.pyplot as plt  # pylint: disable=g-import-not-at-top

@@ -97,6 +98,12 @@ def encode_image_array_as_png_str(image):
 def visualize_images_with_bounding_boxes(images, box_outputs, step,
                                          summary_writer):
   """Records subset of evaluation images with bounding boxes."""
+  if not isinstance(images, list):
+    logging.warning('visualize_images_with_bounding_boxes expects list of '
+                    'images but received type: %s and value: %s',
+                    type(images), images)
+    return
+
   image_shape = tf.shape(images[0])
   image_height = tf.cast(image_shape[0], tf.float32)
   image_width = tf.cast(image_shape[1], tf.float32)