Commit cc748b2a authored by Abdullah Rashwan, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 329754787
parent 2f788e1d
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The generic parser interface."""
import abc
class Parser(metaclass=abc.ABCMeta):
  """Parses data and produces tensors to be consumed by models."""
@abc.abstractmethod
def _parse_train_data(self, decoded_tensors):
"""Generates images and labels that are usable for model training.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
@abc.abstractmethod
def _parse_eval_data(self, decoded_tensors):
"""Generates images and labels that are usable for model evaluation.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
def parse_fn(self, is_training):
"""Returns a parse fn that reads and parses raw tensors from the decoder.
Args:
is_training: a `bool` to indicate whether it is in training mode.
Returns:
      parse: a `callable` that takes the decoded tensors and generates the
        (images, labels) tuple, where labels is a dict of Tensors that
        contains labels.
"""
def parse(decoded_tensors):
"""Parses the serialized example data."""
if is_training:
return self._parse_train_data(decoded_tensors)
else:
return self._parse_eval_data(decoded_tensors)
return parse
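
For illustration only, a minimal sketch of a concrete subclass and how its `parse_fn` plugs into a `tf.data` pipeline; `PassthroughParser` and the tensor keys below are hypothetical, not part of this change.

class PassthroughParser(Parser):
  """A hypothetical parser that forwards decoded tensors unchanged."""

  def _parse_train_data(self, decoded_tensors):
    # A real parser would augment the image and build training targets here.
    return decoded_tensors['image'], {'classes': decoded_tensors['classes']}

  def _parse_eval_data(self, decoded_tensors):
    return decoded_tensors['image'], {'classes': decoded_tensors['classes']}

# Usage: `dataset` elements are dicts of decoded tensors from a decoder.
# dataset = dataset.map(PassthroughParser().parse_fn(is_training=True))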
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data parser and processing for RetinaNet.
Parses images and ground truths in a dataset into training targets and packages
them into (image, labels) tuples for RetinaNet.
"""
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import anchor
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
match_threshold=0.5,
unmatched_threshold=0.5,
aug_rand_hflip=False,
aug_scale_min=1.0,
aug_scale_max=1.0,
use_autoaugment=False,
autoaugment_policy_name='v0',
skip_crowd_during_training=True,
max_num_instances=100,
dtype='bfloat16',
mode=None):
"""Initializes parameters for parsing annotations in the dataset.
Args:
      output_size: `Tensor` or `list` for [height, width] of the output image.
        The output_size should be divisible by the largest feature stride
        2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added on each
        level. For instance, num_scales=2 adds one additional intermediate
        anchor scale [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio of
        anchors added on each level. Each number indicates the ratio of width
        to height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three
        anchors on each scale level.
      anchor_size: `float` number representing the scale of the base anchor
        size relative to the feature stride 2^level.
match_threshold: `float` number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
during training.
autoaugment_policy_name: `string` that specifies the name of the
AutoAugment policy that will be used during training.
      skip_crowd_during_training: `bool`, if True, skip annotations whose
        `is_crowd` equals 1 during training.
      max_num_instances: `int`, the maximum number of instances in an image.
        The groundtruth data will be padded to `max_num_instances`.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with groundtruths in the outputs.
"""
self._mode = mode
self._max_num_instances = max_num_instances
self._skip_crowd_during_training = skip_crowd_during_training
# Anchor.
self._output_size = output_size
self._min_level = min_level
self._max_level = max_level
self._num_scales = num_scales
self._aspect_ratios = aspect_ratios
self._anchor_size = anchor_size
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
self._aug_scale_min = aug_scale_min
self._aug_scale_max = aug_scale_max
# Data Augmentation with AutoAugment.
self._use_autoaugment = use_autoaugment
self._autoaugment_policy_name = autoaugment_policy_name
# Device.
    self._use_bfloat16 = (dtype == 'bfloat16')
def _parse_train_data(self, data):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
is_crowds = data['groundtruth_is_crowd']
# Skips annotations with `is_crowd` = True.
if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
classes = tf.gather(classes, indices)
boxes = tf.gather(boxes, indices)
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Flips image randomly during training.
if self._aug_rand_hflip:
image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes,
tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
# If bfloat16 is used, casts input image to tf.bfloat16.
if self._use_bfloat16:
image = tf.cast(image, dtype=tf.bfloat16)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
}
return image, labels
def _parse_eval_data(self, data):
"""Parses data for training and evaluation."""
groundtruths = {}
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=1.0,
aug_scale_max=1.0)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes,
tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
# If bfloat16 is used, casts input image to tf.bfloat16.
if self._use_bfloat16:
image = tf.cast(image, dtype=tf.bfloat16)
# Sets up groundtruth data for evaluation.
groundtruths = {
'source_id': data['source_id'],
'height': data['height'],
'width': data['width'],
'num_detections': tf.shape(data['groundtruth_classes']),
'image_info': image_info,
'boxes': box_ops.denormalize_boxes(
data['groundtruth_boxes'], image_shape),
'classes': data['groundtruth_classes'],
'areas': data['groundtruth_area'],
'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
}
groundtruths['source_id'] = utils.process_source_id(
groundtruths['source_id'])
groundtruths = utils.pad_groundtruths_to_fixed_size(
groundtruths, self._max_num_instances)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
'groundtruths': groundtruths,
}
return image, labels
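
A minimal sketch of wiring this parser into a plain `tf.data` pipeline; the file pattern and anchor settings below are illustrative assumptions (the test in the next file exercises the same flow through `input_reader.InputReader`).

import tensorflow as tf
from official.vision.beta.dataloaders import retinanet_input
from official.vision.beta.dataloaders import tf_example_decoder

decoder = tf_example_decoder.TfExampleDecoder()
retinanet_parser = retinanet_input.Parser(
    output_size=[640, 640], min_level=3, max_level=7, num_scales=3,
    aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0, dtype='float32')

# Hypothetical TFRecord file pattern.
files = tf.io.gfile.glob('/tmp/coco/val*.tfrecord')
dataset = (tf.data.TFRecordDataset(files)
           .map(decoder.decode)
           .map(retinanet_parser.parse_fn(is_training=True))
           .batch(2, drop_remainder=True))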
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for retinanet_parser.py."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.core import input_reader
from official.modeling.hyperparams import config_definitions as cfg
from official.vision.beta.dataloaders import retinanet_input
from official.vision.beta.dataloaders import tf_example_decoder
class RetinaNetInputTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
([512, 640], True, True, True),
([640, 640], False, False, False),
)
def testRetinanetInputReader(self,
output_size,
skip_crowd_during_training,
use_autoaugment,
is_training):
batch_size = 2
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [0.5, 1.0, 2.0]
anchor_size = 3
max_num_instances = 100
params = cfg.DataConfig(
input_path='/placer/prod/home/snaggletooth/test/data/coco/val*',
global_batch_size=batch_size,
is_training=is_training)
decoder = tf_example_decoder.TfExampleDecoder()
parser = retinanet_input.Parser(
output_size=output_size,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
skip_crowd_during_training=skip_crowd_during_training,
use_autoaugment=use_autoaugment,
max_num_instances=max_num_instances,
dtype='bfloat16')
reader = input_reader.InputReader(
params,
dataset_fn=tf.data.TFRecordDataset,
decoder_fn=decoder.decode,
parser_fn=parser.parse_fn(params.is_training))
dataset = reader.read()
iterator = iter(dataset)
image, labels = next(iterator)
np_image = image.numpy()
np_labels = tf.nest.map_structure(lambda x: x.numpy(), labels)
# Checks image shape.
self.assertEqual(list(np_image.shape),
[batch_size, output_size[0], output_size[1], 3])
# Checks keys in labels.
if is_training:
self.assertCountEqual(
np_labels.keys(),
['cls_targets', 'box_targets', 'anchor_boxes', 'cls_weights',
'box_weights', 'image_info'])
else:
self.assertCountEqual(
np_labels.keys(),
['cls_targets', 'box_targets', 'anchor_boxes', 'cls_weights',
'box_weights', 'groundtruths', 'image_info'])
# Checks shapes of `image_info` and `anchor_boxes`.
self.assertEqual(np_labels['image_info'].shape, (batch_size, 4, 2))
n_anchors = 0
for level in range(min_level, max_level + 1):
stride = 2 ** level
output_size_l = [output_size[0] / stride, output_size[1] / stride]
anchors_per_location = num_scales * len(aspect_ratios)
self.assertEqual(
list(np_labels['anchor_boxes'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
4 * anchors_per_location])
n_anchors += output_size_l[0] * output_size_l[1] * anchors_per_location
# Checks shapes of training objectives.
self.assertEqual(np_labels['cls_weights'].shape, (batch_size, n_anchors))
for level in range(min_level, max_level + 1):
stride = 2 ** level
output_size_l = [output_size[0] / stride, output_size[1] / stride]
anchors_per_location = num_scales * len(aspect_ratios)
self.assertEqual(
list(np_labels['cls_targets'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
anchors_per_location])
self.assertEqual(
list(np_labels['box_targets'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
4 * anchors_per_location])
# Checks shape of groundtruths for eval.
if not is_training:
self.assertEqual(np_labels['groundtruths']['source_id'].shape,
(batch_size,))
self.assertEqual(np_labels['groundtruths']['classes'].shape,
(batch_size, max_num_instances))
self.assertEqual(np_labels['groundtruths']['boxes'].shape,
(batch_size, max_num_instances, 4))
self.assertEqual(np_labels['groundtruths']['areas'].shape,
(batch_size, max_num_instances))
self.assertEqual(np_labels['groundtruths']['is_crowds'].shape,
(batch_size, max_num_instances))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import csv
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import decoder
def _generate_source_id(image_bytes):
return tf.strings.as_string(
tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 63 - 1))
class TfExampleDecoder(decoder.Decoder):
"""Tensorflow Example proto decoder."""
def __init__(self,
include_mask=False,
regenerate_source_id=False):
self._include_mask = include_mask
self._regenerate_source_id = regenerate_source_id
self._keys_to_features = {
'image/encoded': tf.io.FixedLenFeature((), tf.string),
'image/source_id': tf.io.FixedLenFeature((), tf.string),
'image/height': tf.io.FixedLenFeature((), tf.int64),
'image/width': tf.io.FixedLenFeature((), tf.int64),
'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
'image/object/class/label': tf.io.VarLenFeature(tf.int64),
'image/object/area': tf.io.VarLenFeature(tf.float32),
'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
}
if include_mask:
self._keys_to_features.update({
'image/object/mask': tf.io.VarLenFeature(tf.string),
})
def _decode_image(self, parsed_tensors):
"""Decodes the image and set its static shape."""
image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
image.set_shape([None, None, 3])
return image
def _decode_boxes(self, parsed_tensors):
"""Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
return tf.stack([ymin, xmin, ymax, xmax], axis=-1)
def _decode_classes(self, parsed_tensors):
return parsed_tensors['image/object/class/label']
def _decode_areas(self, parsed_tensors):
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32)
width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32)
return tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
lambda: parsed_tensors['image/object/area'],
lambda: (xmax - xmin) * (ymax - ymin) * height * width)
def _decode_masks(self, parsed_tensors):
"""Decode a set of PNG masks to the tf.float32 tensors."""
def _decode_png_mask(png_bytes):
mask = tf.squeeze(
tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
mask = tf.cast(mask, dtype=tf.float32)
mask.set_shape([None, None])
return mask
height = parsed_tensors['image/height']
width = parsed_tensors['image/width']
masks = parsed_tensors['image/object/mask']
return tf.cond(
pred=tf.greater(tf.size(input=masks), 0),
true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))
def decode(self, serialized_example):
"""Decode the serialized example.
Args:
serialized_example: a single serialized tf.Example string.
Returns:
decoded_tensors: a dictionary of tensors with the following fields:
- source_id: a string scalar tensor.
- image: a uint8 tensor of shape [None, None, 3].
- height: an integer scalar tensor.
- width: an integer scalar tensor.
        - groundtruth_classes: an int64 tensor of shape [None].
- groundtruth_is_crowd: a bool tensor of shape [None].
- groundtruth_area: a float32 tensor of shape [None].
- groundtruth_boxes: a float32 tensor of shape [None, 4].
- groundtruth_instance_masks: a float32 tensor of shape
[None, None, None].
- groundtruth_instance_masks_png: a string tensor of shape [None].
"""
parsed_tensors = tf.io.parse_single_example(
serialized=serialized_example, features=self._keys_to_features)
for k in parsed_tensors:
if isinstance(parsed_tensors[k], tf.SparseTensor):
if parsed_tensors[k].dtype == tf.string:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value='')
else:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value=0)
if self._regenerate_source_id:
source_id = _generate_source_id(parsed_tensors['image/encoded'])
else:
source_id = tf.cond(
tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0),
lambda: parsed_tensors['image/source_id'],
lambda: _generate_source_id(parsed_tensors['image/encoded']))
image = self._decode_image(parsed_tensors)
boxes = self._decode_boxes(parsed_tensors)
classes = self._decode_classes(parsed_tensors)
areas = self._decode_areas(parsed_tensors)
is_crowds = tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
lambda: tf.zeros_like(classes, dtype=tf.bool))
if self._include_mask:
masks = self._decode_masks(parsed_tensors)
decoded_tensors = {
'source_id': source_id,
'image': image,
'height': parsed_tensors['image/height'],
'width': parsed_tensors['image/width'],
'groundtruth_classes': classes,
'groundtruth_is_crowd': is_crowds,
'groundtruth_area': areas,
'groundtruth_boxes': boxes,
}
if self._include_mask:
decoded_tensors.update({
'groundtruth_instance_masks': masks,
'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
})
return decoded_tensors
class TfExampleDecoderLabelMap(TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, label_map, include_mask=False, regenerate_source_id=False):
super(TfExampleDecoderLabelMap, self).__init__(
include_mask=include_mask, regenerate_source_id=regenerate_source_id)
self._keys_to_features.update({
'image/object/class/text': tf.io.VarLenFeature(tf.string),
})
name_to_id = self._process_label_map(label_map)
self._name_to_id_table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(
keys=tf.constant(list(name_to_id.keys()), dtype=tf.string),
values=tf.constant(list(name_to_id.values()), dtype=tf.int64)),
default_value=-1)
def _process_label_map(self, label_map):
if label_map.endswith('.csv'):
name_to_id = self._process_csv(label_map)
else:
      raise ValueError('The label map file is in an incorrect format; only '
                       '.csv is supported.')
return name_to_id
def _process_csv(self, label_map):
name_to_id = {}
with tf.io.gfile.GFile(label_map, 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if len(row) != 2:
raise ValueError('Each row of the csv label map file must be in '
'`id,name` format. length = {}'.format(len(row)))
id_index = int(row[0])
name = row[1]
name_to_id[name] = id_index
return name_to_id
def _decode_classes(self, parsed_tensors):
return self._name_to_id_table.lookup(
parsed_tensors['image/object/class/text'])
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_example_decoder.py."""
import io
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
DUMP_SOURCE_ID = b'123'
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
(100, 100, 0, True),
(100, 100, 1, True),
(100, 100, 2, True),
(100, 100, 0, False),
(100, 100, 1, False),
(100, 100, 2, False),
)
def test_result_shape(self,
image_height,
image_width,
num_instances,
regenerate_source_id):
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=True, regenerate_source_id=regenerate_source_id)
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt='JPEG')
if num_instances == 0:
xmins = []
xmaxs = []
ymins = []
ymaxs = []
labels = []
areas = []
is_crowds = []
masks = []
else:
xmins = list(np.random.rand(num_instances))
xmaxs = list(np.random.rand(num_instances))
ymins = list(np.random.rand(num_instances))
ymaxs = list(np.random.rand(num_instances))
labels = list(np.random.randint(100, size=num_instances))
areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
is_crowds = [0] * num_instances
masks = []
for _ in range(num_instances):
mask = _encode_image(
np.uint8(np.random.rand(image_height, image_width) * 255),
fmt='PNG')
masks.append(mask)
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
if not regenerate_source_id:
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
def test_handling_missing_fields(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[False, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import csv
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
class TfExampleDecoderLabelMap(tf_example_decoder.TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, label_map, include_mask=False, regenerate_source_id=False):
super(TfExampleDecoderLabelMap, self).__init__(
include_mask=include_mask, regenerate_source_id=regenerate_source_id)
self._keys_to_features.update({
'image/object/class/text': tf.io.VarLenFeature(tf.string),
})
name_to_id = self._process_label_map(label_map)
self._name_to_id_table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(
keys=tf.constant(list(name_to_id.keys()), dtype=tf.string),
values=tf.constant(list(name_to_id.values()), dtype=tf.int64)),
default_value=-1)
def _process_label_map(self, label_map):
if label_map.endswith('.csv'):
name_to_id = self._process_csv(label_map)
else:
      raise ValueError('The label map file is in an incorrect format; only '
                       '.csv is supported.')
return name_to_id
def _process_csv(self, label_map):
name_to_id = {}
with tf.io.gfile.GFile(label_map, 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if len(row) != 2:
raise ValueError('Each row of the csv label map file must be in '
'`id,name` format. length = {}'.format(len(row)))
id_index = int(row[0])
name = row[1]
name_to_id[name] = id_index
return name_to_id
def _decode_classes(self, parsed_tensors):
return self._name_to_id_table.lookup(
parsed_tensors['image/object/class/text'])
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_example_label_map_decoder.py."""
import io
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_label_map_decoder
DUMP_SOURCE_ID = b'123'
LABEL_MAP_CSV_CONTENT = '0,class_0\n1,class_1\n2,class_2'
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
  @parameterized.parameters(
      (100, 100, 0),
      (100, 100, 1),
      (100, 100, 2),
  )
def test_result_shape(self, image_height, image_width, num_instances):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt='JPEG')
if num_instances == 0:
xmins = []
xmaxs = []
ymins = []
ymaxs = []
labels = []
areas = []
is_crowds = []
masks = []
else:
xmins = list(np.random.rand(num_instances))
xmaxs = list(np.random.rand(num_instances))
ymins = list(np.random.rand(num_instances))
ymaxs = list(np.random.rand(num_instances))
areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
is_crowds = [0] * num_instances
masks = []
labels = [b'class_1'] * num_instances
for _ in range(num_instances):
mask = _encode_image(
np.uint8(np.random.rand(image_height, image_width) * 255),
fmt='PNG')
masks.append(mask)
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/text': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [b'class_2', b'class_0']
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/text': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[2, 0], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader utils."""
# Import libraries
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int32)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int32),
false_fn=lambda: tf.identity(source_id))
return source_id
def pad_groundtruths_to_fixed_size(groundtruths, size):
"""Pads the first dimension of groundtruths labels to the fixed size."""
groundtruths['boxes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['boxes'], size, -1)
groundtruths['is_crowds'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['is_crowds'], size, 0)
groundtruths['areas'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['areas'], size, -1)
groundtruths['classes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['classes'], size, -1)
return groundtruths
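
A short sketch of what these helpers do on concrete values; the shapes below are illustrative assumptions.

import tensorflow as tf
from official.vision.beta.dataloaders import utils

# Numeric-string ids are converted to int32 scalars.
source_id = utils.process_source_id(tf.constant('123'))  # -> 123 (tf.int32)

# The leading dimension of each groundtruth field is clipped or padded to
# `size`; boxes/areas/classes pad with -1 and is_crowds pads with 0.
groundtruths = {
    'boxes': tf.zeros([2, 4]),
    'is_crowds': tf.zeros([2], tf.int32),
    'areas': tf.zeros([2]),
    'classes': tf.zeros([2], tf.int32),
}
groundtruths = utils.pad_groundtruths_to_fixed_size(groundtruths, size=100)
# groundtruths['boxes'].shape == [100, 4]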
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Parser for video and label datasets."""
from typing import Dict, Optional, Tuple
from absl import logging
import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import preprocess_ops_3d
IMAGE_KEY = 'image/encoded'
LABEL_KEY = 'clip/label/index'
def _process_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
stride: int = 1,
num_test_clips: int = 1,
min_resize: int = 224,
crop_size: int = 200,
zero_centering_image: bool = False,
seed: Optional[int] = None) -> tf.Tensor:
"""Processes a serialized image tensor.
Args:
image: Input Tensor of shape [timesteps] and type tf.string of serialized
frames.
is_training: Whether or not in training mode. If True, random sample, crop
and left right flip is used.
num_frames: Number of frames per subclip.
stride: Temporal stride to sample frames.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
      are aggregated in the batch dimension.
min_resize: Frames are resized so that min(height, width) is min_resize.
crop_size: Final size of the frame after cropping the resized frames. Both
height and width are the same.
zero_centering_image: If True, frames are normalized to values in [-1, 1].
If False, values in [0, 1].
seed: A deterministic seed to use when sampling.
Returns:
Processed frames. Tensor of shape
[num_frames * num_test_clips, crop_size, crop_size, 3].
"""
# Validate parameters.
if is_training and num_test_clips != 1:
logging.warning(
'`num_test_clips` %d is ignored since `is_training` is `True`.',
num_test_clips)
# Temporal sampler.
if is_training:
# Sample random clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride,
seed)
elif num_test_clips > 1:
# Sample linspace clips.
image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips,
num_frames, stride)
else:
# Sample middle clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride)
# Decode JPEG string to tf.uint8.
image = preprocess_ops_3d.decode_jpeg(image, 3)
# Resize images (resize happens only if necessary to save compute).
image = preprocess_ops_3d.resize_smallest(image, min_resize)
if is_training:
# Standard image data augmentation: random crop and random flip.
image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, True,
seed)
image = preprocess_ops_3d.random_flip_left_right(image, seed)
else:
# Central crop of the frames.
image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False)
  # Casts the frames to float32 and normalizes per zero_centering_image.
return preprocess_ops_3d.normalize_image(image, zero_centering_image)
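# For example (illustrative values): in eval mode with num_test_clips=2,
# num_frames=32, stride=1 and crop_size=200, two linearly spaced 32-frame
# clips are sampled, so the returned tensor has shape [64, 200, 200, 3].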
def _postprocess_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
num_test_clips: int = 1) -> tf.Tensor:
"""Processes a batched Tensor of frames.
  The same parameters used in `_process_image` should be used here.
Args:
image: Input Tensor of shape [batch, timesteps, height, width, 3].
is_training: Whether or not in training mode. If True, random sample, crop
and left right flip is used.
num_frames: Number of frames per subclip.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
      are aggregated in the batch dimension.
Returns:
Processed frames. Tensor of shape
[batch * num_test_clips, num_frames, height, width, 3].
"""
  if num_test_clips > 1 and not is_training:
    # In this case, multiple clips are merged together in the batch dimension,
    # which becomes B * num_test_clips.
image = tf.reshape(
image, (-1, num_frames, image.shape[2], image.shape[3], image.shape[4]))
return image
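# For example (illustrative values): a batch of 4 videos processed with
# num_test_clips=2 and num_frames=32 arrives here as [4, 64, H, W, 3] and is
# reshaped to [8, 32, H, W, 3] so that each clip is scored independently.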
def _process_label(label: tf.Tensor,
one_hot_label: bool = True,
num_classes: Optional[int] = None) -> tf.Tensor:
"""Processes label Tensor."""
# Validate parameters.
if one_hot_label and not num_classes:
raise ValueError(
'`num_classes` should be given when requesting one hot label.')
# Cast to tf.int32.
label = tf.cast(label, dtype=tf.int32)
if one_hot_label:
# Replace label index by one hot representation.
label = tf.one_hot(label, num_classes)
return label
class Decoder(decoder.Decoder):
"""A tf.Example decoder for classification task."""
  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
    self._image_key = image_key
    self._label_key = label_key
self._context_description = {
# One integer stored in context.
self._label_key: tf.io.FixedLenFeature((), tf.int64),
}
self._sequence_description = {
# Each image is a string encoding JPEG.
self._image_key: tf.io.FixedLenSequenceFeature((), tf.string),
}
def decode(self, serialized_example):
"""Parses a single tf.Example into image and label tensors."""
context, sequences = tf.io.parse_single_sequence_example(
serialized_example, self._context_description,
self._sequence_description)
return {
self._image_key: sequences[self._image_key],
self._label_key: context[self._label_key]
}
class Parser(parser.Parser):
"""Parses a video and label dataset."""
def __init__(self,
input_params: exp_cfg.DataConfig,
image_key: str = IMAGE_KEY,
label_key: str = LABEL_KEY):
self._num_frames = input_params.feature_shape[0]
self._stride = input_params.temporal_stride
self._num_test_clips = input_params.num_test_clips
self._min_resize = input_params.min_image_size
self._crop_size = input_params.feature_shape[1]
self._one_hot_label = input_params.one_hot
self._num_classes = input_params.num_classes
self._image_key = image_key
self._label_key = label_key
def _parse_train_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for training."""
# Process image and label.
image = decoded_tensors[self._image_key]
label = decoded_tensors[self._label_key]
image = _process_image(
image=image,
is_training=True,
num_frames=self._num_frames,
stride=self._stride,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size)
label = _process_label(label, self._one_hot_label, self._num_classes)
return {'image': image}, label
def _parse_eval_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for evaluation."""
image = decoded_tensors[self._image_key]
label = decoded_tensors[self._label_key]
image = _process_image(
image=image,
is_training=False,
num_frames=self._num_frames,
stride=self._stride,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size)
label = _process_label(label, self._one_hot_label, self._num_classes)
return {'image': image}, label
class PostBatchProcessor(object):
"""Processes a video and label dataset which is batched."""
def __init__(self, input_params: exp_cfg.DataConfig):
self._is_training = input_params.is_training
self._num_frames = input_params.feature_shape[0]
self._num_test_clips = input_params.num_test_clips
def __call__(
self,
image: Dict[str, tf.Tensor],
label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses a single tf.Example into image and label tensors."""
image = image['image']
image = _postprocess_image(
image=image,
is_training=self._is_training,
num_frames=self._num_frames,
num_test_clips=self._num_test_clips)
return {'image': image}, label
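
A minimal sketch of the intended end-to-end flow for this video pipeline; the config values and batch size are illustrative (the test in the next file runs the same path on fake data).

import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import video_input

params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)  # Illustrative small clip.
params.min_image_size = 224
decoder = video_input.Decoder()
video_parser = video_input.Parser(params)
postprocess = video_input.PostBatchProcessor(params)

# Given a dataset of serialized tf.SequenceExample strings:
# dataset = (dataset.map(decoder.decode)
#            .map(video_parser.parse_fn(params.is_training))
#            .batch(8)
#            .map(postprocess))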
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_input.py."""
import io
# Import libraries
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import video_input
class DecoderTest(tf.test.TestCase):
  """Tests the video classification tf.SequenceExample decoder."""
def test_decoder(self):
decoder = video_input.Decoder()
# Create fake data.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
label = 42
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
seq_example = tf.train.SequenceExample()
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [
label
]
serialized_example = seq_example.SerializeToString()
decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertCountEqual([video_input.IMAGE_KEY, video_input.LABEL_KEY],
results.keys())
self.assertEqual(label, results[video_input.LABEL_KEY])
class VideoAndLabelParserTest(tf.test.TestCase):
def test_video_input(self):
params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)
params.min_image_size = 224
decoder = video_input.Decoder()
parser = video_input.Parser(params).parse_fn(params.is_training)
# Create fake data.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
seq_example = tf.train.SequenceExample()
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [
42
]
input_tensor = tf.constant(seq_example.SerializeToString())
decoded_tensors = decoder.decode(input_tensor)
output_tensor = parser(decoded_tensors)
image_features, label = output_tensor
image = image_features['image']
self.assertAllEqual(image.shape, (2, 224, 224, 3))
self.assertAllEqual(label.shape, (600,))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The COCO-style evaluator.
The following snippet demonstrates the use of interfaces:
evaluator = COCOEvaluator(...)
for _ in range(num_evals):
for _ in range(num_batches_per_eval):
predictions, groundtruths = predictor.predict(...)  # Pop a batch.
evaluator.update_state(groundtruths, predictions)
evaluator.result() # finish one full eval and reset states.
See also: https://github.com/cocodataset/cocoapi/
"""
import atexit
import tempfile
# Import libraries
from absl import logging
import numpy as np
from pycocotools import cocoeval
import six
import tensorflow as tf
from official.vision.beta.evaluation import coco_utils
class COCOEvaluator(object):
"""COCO evaluation metric class."""
def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True):
"""Constructs COCO evaluation class.
The class provides the interface to COCO metrics. `update_state()` takes
detections from each image and pushes them to `self._predictions`.
`evaluate()` loads a JSON file in COCO annotation format as the groundtruths
and runs COCO evaluation.
Args:
annotation_file: a JSON file that stores annotations of the eval dataset.
If `annotation_file` is None, groundtruth annotations will be loaded
from the dataloader.
include_mask: a boolean to indicate whether or not to include the mask
eval.
need_rescale_bboxes: If true bboxes in `predictions` will be rescaled back
to absolute values (`image_info` is needed in this case).
"""
if annotation_file:
if annotation_file.startswith('gs://'):
_, local_val_json = tempfile.mkstemp(suffix='.json')
tf.io.gfile.remove(local_val_json)
tf.io.gfile.copy(annotation_file, local_val_json)
atexit.register(tf.io.gfile.remove, local_val_json)
else:
local_val_json = annotation_file
self._coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if include_mask else 'box'),
annotation_file=local_val_json)
self._annotation_file = annotation_file
self._include_mask = include_mask
self._metric_names = [
'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10',
'ARmax100', 'ARs', 'ARm', 'ARl'
]
self._required_prediction_fields = [
'source_id', 'num_detections', 'detection_classes', 'detection_scores',
'detection_boxes'
]
self._need_rescale_bboxes = need_rescale_bboxes
if self._need_rescale_bboxes:
self._required_prediction_fields.append('image_info')
self._required_groundtruth_fields = [
'source_id', 'height', 'width', 'classes', 'boxes'
]
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self._metric_names]
self._metric_names.extend(mask_metric_names)
self._required_prediction_fields.extend(['detection_masks'])
self._required_groundtruth_fields.extend(['masks'])
self.reset_states()
@property
def name(self):
return 'coco_metric'
def reset_states(self):
"""Resets internal states for a fresh run."""
self._predictions = {}
if not self._annotation_file:
self._groundtruths = {}
def result(self):
"""Evaluates detection results, and reset_states."""
metric_dict = self.evaluate()
# Cleans up the internal variables in order for a fresh eval next time.
self.reset_states()
return metric_dict
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
coco_metric: float numpy array with shape [24] representing the
coco-style evaluation metrics (box and mask).
"""
if not self._annotation_file:
logging.info('There is no annotation_file in COCOEvaluator.')
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
logging.info('Using annotation file: %s', self._annotation_file)
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
return metrics_dict
def _process_predictions(self, predictions):
image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
predictions['detection_boxes'] = (
predictions['detection_boxes'].astype(np.float32))
predictions['detection_boxes'] /= image_scale
if 'detection_outer_boxes' in predictions:
predictions['detection_outer_boxes'] = (
predictions['detection_outer_boxes'].astype(np.float32))
predictions['detection_outer_boxes'] /= image_scale
def _convert_to_numpy(self, groundtruths, predictions):
"""Converts tesnors to numpy arrays."""
if groundtruths:
labels = tf.nest.map_structure(lambda x: x.numpy(), groundtruths)
numpy_groundtruths = {}
for key, val in labels.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_groundtruths[key] = val
else:
numpy_groundtruths = groundtruths
if predictions:
outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions)
numpy_predictions = {}
for key, val in outputs.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_predictions[key] = val
else:
numpy_predictions = predictions
return numpy_groundtruths, numpy_predictions
def update_state(self, groundtruths, predictions):
"""Update and aggregate detection results and groundtruth data.
Args:
groundtruths: a dictionary of Tensors including the fields below.
See also different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- height: a numpy array of int of shape [batch_size].
- width: a numpy array of int of shape [batch_size].
- num_detections: a numpy array of int of shape [batch_size].
- boxes: a numpy array of float of shape [batch_size, K, 4].
- classes: a numpy array of int of shape [batch_size, K].
Optional fields:
- is_crowds: a numpy array of int of shape [batch_size, K]. If the
field is absent, it is assumed that this instance is not crowd.
- areas: a numpy array of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a numpy array of float of shape
[batch_size, K, mask_height, mask_width].
predictions: a dictionary of tensors including the fields below.
See different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- image_info [if `need_rescale_bboxes` is True]: a numpy array of
float of shape [batch_size, 4, 2].
- num_detections: a numpy array of
int of shape [batch_size].
- detection_boxes: a numpy array of float of shape [batch_size, K, 4].
- detection_classes: a numpy array of int of shape [batch_size, K].
- detection_scores: a numpy array of float of shape [batch_size, K].
Optional fields:
- detection_masks: a numpy array of float of shape
[batch_size, K, mask_height, mask_width].
Raises:
ValueError: if the required prediction or groundtruth fields are not
present in the incoming `predictions` or `groundtruths`.
"""
groundtruths, predictions = self._convert_to_numpy(groundtruths,
predictions)
for k in self._required_prediction_fields:
if k not in predictions:
raise ValueError(
'Missing the required key `{}` in predictions!'.format(k))
if self._need_rescale_bboxes:
self._process_predictions(predictions)
for k, v in six.iteritems(predictions):
if k not in self._predictions:
self._predictions[k] = [v]
else:
self._predictions[k].append(v)
if not self._annotation_file:
assert groundtruths
for k in self._required_groundtruth_fields:
if k not in groundtruths:
raise ValueError(
'Missing the required key `{}` in groundtruths!'.format(k))
for k, v in six.iteritems(groundtruths):
if k not in self._groundtruths:
self._groundtruths[k] = [v]
else:
self._groundtruths[k].append(v)
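# Editor's example: a minimal, hypothetical usage sketch, not part of the
# original file. The arrays are placeholders shaped [batch=1, K=1, ...]; boxes
# are [y1, x1, y2, x2]. With `need_rescale_bboxes=False`, `image_info` is not
# required. `result()` runs pycocotools evaluation and resets state.
def _example_coco_evaluator_usage(annotation_file):
  evaluator = COCOEvaluator(
      annotation_file=annotation_file,
      include_mask=False,
      need_rescale_bboxes=False)
  predictions = tf.nest.map_structure(tf.convert_to_tensor, {
      'source_id': np.array([1]),
      'num_detections': np.array([1]),
      'detection_boxes': np.array([[[10., 10., 50., 50.]]], np.float32),
      'detection_classes': np.array([[1]]),
      'detection_scores': np.array([[0.9]], np.float32),
  })
  evaluator.update_state(groundtruths=None, predictions=predictions)
  return evaluator.result()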
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for coco_evaluator."""
import io
import os
# Import libraries
from absl import logging
from absl.testing import absltest
from absl.testing import parameterized
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
import six
import tensorflow as tf
from official.vision.beta.evaluation import coco_evaluator
from official.vision.beta.evaluation import coco_utils
_COCO_JSON_FILE = '/placer/prod/home/snaggletooth/test/data/coco/instances_val2017.json'
_SAVED_COCO_JSON_FILE = 'tmp.json'
def get_groundtruth_annotations(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
image = coco.loadImgs([image_id])[0]
groundtruths = {
'boxes': [],
'classes': [],
'is_crowds': [],
'areas': [],
}
if include_mask:
groundtruths['masks'] = []
for ann in anns:
# Creates detections from groundtruths.
# Converts [x, y, w, h] to [y1, x1, y2, x2] box format.
box = [ann['bbox'][1],
ann['bbox'][0],
(ann['bbox'][1] + ann['bbox'][3]),
(ann['bbox'][0] + ann['bbox'][2])]
# Creates groundtruths.
groundtruths['boxes'].append(box)
groundtruths['classes'].append(ann['category_id'])
groundtruths['is_crowds'].append(ann['iscrowd'])
groundtruths['areas'].append(ann['area'])
if include_mask:
mask_img = Image.fromarray(coco.annToMask(ann).astype(np.uint8))
with io.BytesIO() as stream:
mask_img.save(stream, format='PNG')
mask_bytes = stream.getvalue()
groundtruths['masks'].append(mask_bytes)
for key, val in groundtruths.items():
groundtruths[key] = np.stack(val, axis=0)
groundtruths['source_id'] = image['id']
groundtruths['height'] = image['height']
groundtruths['width'] = image['width']
groundtruths['num_detections'] = len(anns)
for k, v in six.iteritems(groundtruths):
groundtruths[k] = np.expand_dims(v, axis=0)
return groundtruths
def get_predictions(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
image = coco.loadImgs([image_id])[0]
predictions = {
'detection_boxes': [],
'detection_classes': [],
'detection_scores': [],
}
if include_mask:
predictions['detection_masks'] = []
for ann in anns:
# Creates detections from groundtruths.
# Converts [x, y, w, h] to [y1, x1, y2, x2] box format and
# does the denormalization.
box = [ann['bbox'][1],
ann['bbox'][0],
(ann['bbox'][1] + ann['bbox'][3]),
(ann['bbox'][0] + ann['bbox'][2])]
predictions['detection_boxes'].append(box)
predictions['detection_classes'].append(ann['category_id'])
predictions['detection_scores'].append(1)
if include_mask:
mask = coco.annToMask(ann)
predictions['detection_masks'].append(mask)
for key, val in predictions.items():
predictions[key] = np.expand_dims(np.stack(val, axis=0), axis=0)
predictions['source_id'] = np.array([image['id']])
predictions['num_detections'] = np.array([len(anns)])
predictions['image_info'] = np.array(
[[[image['height'], image['width']],
[image['height'], image['width']],
[1, 1],
[0, 0]]], dtype=np.float32)
return predictions
def get_fake_predictions(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
label_id_max = max([ann['category_id'] for ann in anns])
image = coco.loadImgs([image_id])[0]
num_detections = 100
xmin = np.random.randint(
low=0, high=int(image['width'] / 2), size=(1, num_detections))
xmax = np.random.randint(
low=int(image['width'] / 2), high=image['width'],
size=(1, num_detections))
ymin = np.random.randint(
low=0, high=int(image['height'] / 2), size=(1, num_detections))
ymax = np.random.randint(
low=int(image['height'] / 2), high=image['height'],
size=(1, num_detections))
predictions = {
'detection_boxes': np.stack([ymin, xmin, ymax, xmax], axis=-1),
'detection_classes': np.random.randint(
low=0, high=(label_id_max + 1), size=(1, num_detections)),
'detection_scores': np.random.random(size=(1, num_detections)),
}
if include_mask:
predictions['detection_masks'] = np.random.randint(
1, size=(1, num_detections, image['height'], image['width']))
predictions['source_id'] = np.array([image['id']])
predictions['num_detections'] = np.array([num_detections])
predictions['image_info'] = np.array(
[[[image['height'], image['width']],
[image['height'], image['width']],
[1, 1],
[0, 0]]], dtype=np.float32)
return predictions
class DummyGroundtruthGenerator(object):
def __init__(self, include_mask, image_id, coco):
self._include_mask = include_mask
self._image_id = image_id
self._coco = coco
def __call__(self):
yield get_groundtruth_annotations(
self._image_id, self._coco, self._include_mask)
class COCOEvaluatorTest(parameterized.TestCase, absltest.TestCase):
def setUp(self):
super(COCOEvaluatorTest, self).setUp()
temp = self.create_tempdir()
self._saved_coco_json_file = os.path.join(temp.full_path,
_SAVED_COCO_JSON_FILE)
def tearDown(self):
super(COCOEvaluatorTest, self).tearDown()
@parameterized.parameters(
(False, False), (False, True), (True, False), (True, True))
def testEval(self, include_mask, use_fake_predictions):
coco = COCO(annotation_file=_COCO_JSON_FILE)
index = np.random.randint(len(coco.dataset['images']))
image_id = coco.dataset['images'][index]['id']
# image_id = 26564
# image_id = 324158
if use_fake_predictions:
predictions = get_fake_predictions(
image_id, coco, include_mask=include_mask)
else:
predictions = get_predictions(image_id, coco, include_mask=include_mask)
if not predictions:
logging.info('Empty predictions for index=%d', index)
return
predictions = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
predictions)
evaluator_w_json = coco_evaluator.COCOEvaluator(
annotation_file=_COCO_JSON_FILE, include_mask=include_mask)
evaluator_w_json.update_state(groundtruths=None, predictions=predictions)
results_w_json = evaluator_w_json.result()
dummy_generator = DummyGroundtruthGenerator(
include_mask=include_mask, image_id=image_id, coco=coco)
coco_utils.generate_annotation_file(dummy_generator,
self._saved_coco_json_file)
evaluator_no_json = coco_evaluator.COCOEvaluator(
annotation_file=self._saved_coco_json_file, include_mask=include_mask)
evaluator_no_json.update_state(groundtruths=None, predictions=predictions)
results_no_json = evaluator_no_json.result()
for k, v in results_w_json.items():
self.assertEqual(v, results_no_json[k])
@parameterized.parameters(
(False, False), (False, True), (True, False), (True, True))
def testEvalOnTheFly(self, include_mask, use_fake_predictions):
coco = COCO(annotation_file=_COCO_JSON_FILE)
index = np.random.randint(len(coco.dataset['images']))
image_id = coco.dataset['images'][index]['id']
# image_id = 26564
# image_id = 324158
if use_fake_predictions:
predictions = get_fake_predictions(
image_id, coco, include_mask=include_mask)
else:
predictions = get_predictions(image_id, coco, include_mask=include_mask)
if not predictions:
logging.info('Empty predictions for index=%d', index)
return
predictions = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
predictions)
evaluator_w_json = coco_evaluator.COCOEvaluator(
annotation_file=_COCO_JSON_FILE, include_mask=include_mask)
evaluator_w_json.update_state(groundtruths=None, predictions=predictions)
results_w_json = evaluator_w_json.result()
groundtruths = get_groundtruth_annotations(image_id, coco, include_mask)
groundtruths = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
groundtruths)
evaluator_no_json = coco_evaluator.COCOEvaluator(
annotation_file=None, include_mask=include_mask)
evaluator_no_json.update_state(groundtruths, predictions)
results_no_json = evaluator_no_json.result()
for k, v in results_w_json.items():
self.assertEqual(v, results_no_json[k])
if __name__ == '__main__':
absltest.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Util functions related to pycocotools and COCO eval."""
import copy
import json
# Import libraries
from absl import logging
import numpy as np
from PIL import Image
from pycocotools import coco
from pycocotools import mask as mask_api
import six
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import mask_ops
class COCOWrapper(coco.COCO):
"""COCO wrapper class.
This class wraps COCO API object, which provides the following additional
functionalities:
1. Support string type image id.
2. Support loading the groundtruth dataset using the external annotation
dictionary.
3. Support loading the prediction results using the external annotation
dictionary.
"""
def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None):
"""Instantiates a COCO-style API object.
Args:
eval_type: either 'box' or 'mask'.
annotation_file: a JSON file that stores annotations of the eval dataset.
This is required if `gt_dataset` is not provided.
gt_dataset: the groundtruth eval datatset in COCO API format.
"""
if ((annotation_file and gt_dataset) or
((not annotation_file) and (not gt_dataset))):
raise ValueError('One and only one of `annotation_file` and `gt_dataset` '
'needs to be specified.')
if eval_type not in ['box', 'mask']:
raise ValueError('The `eval_type` can only be either `box` or `mask`.')
coco.COCO.__init__(self, annotation_file=annotation_file)
self._eval_type = eval_type
if gt_dataset:
self.dataset = gt_dataset
self.createIndex()
def loadRes(self, predictions):
"""Loads result file and return a result api object.
Args:
predictions: a list of dictionary each representing an annotation in COCO
format. The required fields are `image_id`, `category_id`, `score`,
`bbox`, `segmentation`.
Returns:
res: result COCO api object.
Raises:
ValueError: if the set of image ids from predictions is not a subset of
the set of image ids of the groundtruth dataset.
"""
res = coco.COCO()
res.dataset['images'] = copy.deepcopy(self.dataset['images'])
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
image_ids = [ann['image_id'] for ann in predictions]
if set(image_ids) != (set(image_ids) & set(self.getImgIds())):
raise ValueError('Results do not correspond to the current dataset!')
for ann in predictions:
x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2],
ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]]
if self._eval_type == 'box':
ann['area'] = ann['bbox'][2] * ann['bbox'][3]
ann['segmentation'] = [
[x1, y1, x1, y2, x2, y2, x2, y1]]
elif self._eval_type == 'mask':
ann['area'] = mask_api.area(ann['segmentation'])
res.dataset['annotations'] = copy.deepcopy(predictions)
res.createIndex()
return res
def convert_predictions_to_coco_annotations(predictions):
"""Converts a batch of predictions to annotations in COCO format.
Args:
predictions: a dictionary of lists of numpy arrays including the following
fields. K below denotes the maximum number of instances per image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- detection_boxes: a list of numpy arrays of float of shape
[batch_size, K, 4], where coordinates are in the original image
space (not the scaled image space).
- detection_classes: a list of numpy arrays of int of shape
[batch_size, K].
- detection_scores: a list of numpy arrays of float of shape
[batch_size, K].
Optional fields:
- detection_masks: a list of numpy arrays of float of shape
[batch_size, K, mask_height, mask_width].
Returns:
coco_predictions: prediction in COCO annotation format.
"""
coco_predictions = []
num_batches = len(predictions['source_id'])
batch_size = predictions['source_id'][0].shape[0]
max_num_detections = predictions['detection_classes'][0].shape[1]
use_outer_box = 'detection_outer_boxes' in predictions
for i in range(num_batches):
predictions['detection_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_boxes'][i])
if use_outer_box:
predictions['detection_outer_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_outer_boxes'][i])
mask_boxes = predictions['detection_outer_boxes']
else:
mask_boxes = predictions['detection_boxes']
for j in range(batch_size):
if 'detection_masks' in predictions:
image_masks = mask_ops.paste_instance_masks(
predictions['detection_masks'][i][j],
mask_boxes[i][j],
int(predictions['image_info'][i][j, 0, 0]),
int(predictions['image_info'][i][j, 0, 1]))
binary_masks = (image_masks > 0.0).astype(np.uint8)
encoded_masks = [
mask_api.encode(np.asfortranarray(binary_mask))
for binary_mask in list(binary_masks)]
for k in range(max_num_detections):
ann = {}
ann['image_id'] = predictions['source_id'][i][j]
ann['category_id'] = predictions['detection_classes'][i][j, k]
ann['bbox'] = predictions['detection_boxes'][i][j, k]
ann['score'] = predictions['detection_scores'][i][j, k]
if 'detection_masks' in predictions:
ann['segmentation'] = encoded_masks[k]
coco_predictions.append(ann)
for i, ann in enumerate(coco_predictions):
ann['id'] = i + 1
return coco_predictions
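# Editor's example: a hypothetical single-image sketch, not part of the
# original file. Each value is a list over batches of numpy arrays, as the
# docstring above describes; boxes enter as [y1, x1, y2, x2] and leave in
# COCO [x, y, w, h] order via `box_ops.yxyx_to_xywh`.
def _example_convert_predictions():
  predictions = {
      'source_id': [np.array([1])],
      'num_detections': [np.array([1])],
      'detection_boxes': [np.array([[[10., 20., 30., 60.]]], np.float32)],
      'detection_classes': [np.array([[5]])],
      'detection_scores': [np.array([[0.8]], np.float32)],
  }
  # Yields one annotation with bbox [20., 10., 40., 20.] and id 1.
  return convert_predictions_to_coco_annotations(predictions)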
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
"""Converts groundtruths to the dataset in COCO format.
Args:
groundtruths: a dictionary of lists of numpy arrays including the fields
below. Note that each element in a list represents the data for a single
batch. K below denotes the actual number of instances for each image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- height: a list of numpy arrays of int of shape [batch_size].
- width: a list of numpy arrays of int of shape [batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
where coordinates are in the original image space (not the
normalized coordinates).
- classes: a list of numpy arrays of int of shape [batch_size, K].
Optional fields:
- is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If
the field is absent, it is assumed that this instance is not crowd.
- areas: a list of numpy arrays of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a list of numpy arrays of string of shape [batch_size, K].
label_map: (optional) a dictionary that maps the category id to the
category name. If `None`, the category mapping is collected from the
`groundtruths`.
Returns:
coco_groundtruths: the groundtruth dataset in COCO format.
"""
source_ids = np.concatenate(groundtruths['source_id'], axis=0)
heights = np.concatenate(groundtruths['height'], axis=0)
widths = np.concatenate(groundtruths['width'], axis=0)
gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)} for i, h, w
in zip(source_ids, heights, widths)]
gt_annotations = []
num_batches = len(groundtruths['source_id'])
batch_size = groundtruths['source_id'][0].shape[0]
for i in range(num_batches):
for j in range(batch_size):
num_instances = groundtruths['num_detections'][i][j]
for k in range(int(num_instances)):
ann = {}
ann['image_id'] = int(groundtruths['source_id'][i][j])
if 'is_crowds' in groundtruths:
ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
else:
ann['iscrowd'] = 0
ann['category_id'] = int(groundtruths['classes'][i][j, k])
boxes = groundtruths['boxes'][i]
ann['bbox'] = [
float(boxes[j, k, 1]),
float(boxes[j, k, 0]),
float(boxes[j, k, 3] - boxes[j, k, 1]),
float(boxes[j, k, 2] - boxes[j, k, 0])]
if 'areas' in groundtruths:
ann['area'] = float(groundtruths['areas'][i][j, k])
else:
ann['area'] = float(
(boxes[j, k, 3] - boxes[j, k, 1]) *
(boxes[j, k, 2] - boxes[j, k, 0]))
if 'masks' in groundtruths:
mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
np_mask[np_mask > 0] = 255
encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
ann['segmentation'] = encoded_mask
if 'areas' not in groundtruths:
ann['area'] = mask_api.area(encoded_mask)
gt_annotations.append(ann)
for i, ann in enumerate(gt_annotations):
ann['id'] = i + 1
if label_map:
gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
else:
category_ids = [gt['category_id'] for gt in gt_annotations]
gt_categories = [{'id': i} for i in set(category_ids)]
gt_dataset = {
'images': gt_images,
'categories': gt_categories,
'annotations': copy.deepcopy(gt_annotations),
}
return gt_dataset
class COCOGroundtruthGenerator:
"""Generates the groundtruth annotations from a single example."""
def __init__(self, file_pattern, num_examples, include_mask):
self._file_pattern = file_pattern
self._num_examples = num_examples
self._include_mask = include_mask
self._dataset_fn = tf.data.TFRecordDataset
def _parse_single_example(self, example):
"""Parses a single serialized tf.Example proto.
Args:
example: a serialized tf.Example proto string.
Returns:
A dictionary of groundtruth with the following fields:
source_id: a scalar tensor of int64 representing the image source_id.
height: a scalar tensor of int64 representing the image height.
width: a scalar tensor of int64 representing the image width.
boxes: a float tensor of shape [K, 4], representing the groundtruth
boxes in absolute coordinates with respect to the original image size.
classes: a int64 tensor of shape [K], representing the class labels of
each instances.
is_crowds: a bool tensor of shape [K], indicating whether the instance
is crowd.
areas: a float tensor of shape [K], indicating the area of each
instance.
masks: a string tensor of shape [K], containing the bytes of the png
mask of each instance.
"""
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=self._include_mask)
decoded_tensors = decoder.decode(example)
image = decoded_tensors['image']
image_size = tf.shape(image)[0:2]
boxes = box_ops.denormalize_boxes(
decoded_tensors['groundtruth_boxes'], image_size)
groundtruths = {
'source_id': tf.strings.to_number(
decoded_tensors['source_id'], out_type=tf.int64),
'height': decoded_tensors['height'],
'width': decoded_tensors['width'],
'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
'boxes': boxes,
'classes': decoded_tensors['groundtruth_classes'],
'is_crowds': decoded_tensors['groundtruth_is_crowd'],
'areas': decoded_tensors['groundtruth_area'],
}
if self._include_mask:
groundtruths.update({
'masks': decoded_tensors['groundtruth_instance_masks_png'],
})
return groundtruths
def _build_pipeline(self):
"""Builds data pipeline to generate groundtruth annotations."""
dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
dataset = dataset.interleave(
map_func=lambda filename: self._dataset_fn(filename).prefetch(1),
cycle_length=12,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.map(self._parse_single_example,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(1, drop_remainder=False)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def __call__(self):
for groundtruth_result in self._build_pipeline():
yield groundtruth_result
def scan_and_generator_annotation_file(file_pattern: str,
num_samples: int,
include_mask: bool,
annotation_file: str):
"""Scans and generate the COCO-style annotation JSON file given a dataset."""
groundtruth_generator = COCOGroundtruthGenerator(
file_pattern, num_samples, include_mask)
generate_annotation_file(groundtruth_generator, annotation_file)
def generate_annotation_file(groundtruth_generator,
annotation_file):
"""Generates COCO-style annotation JSON file given a groundtruth generator."""
groundtruths = {}
logging.info('Loading groundtruth annotations from dataset to memory...')
for groundtruth in groundtruth_generator():
for k, v in six.iteritems(groundtruth):
if k not in groundtruths:
groundtruths[k] = [v]
else:
groundtruths[k].append(v)
gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths)
logging.info('Saving groundtruth annotations to the JSON file...')
with tf.io.gfile.GFile(annotation_file, 'w') as f:
f.write(json.dumps(gt_dataset))
logging.info('Done saving the JSON file...')
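# Editor's example: a hypothetical sketch, not part of the original file.
# Any zero-argument callable that yields groundtruth dicts (such as
# COCOGroundtruthGenerator above) can drive `generate_annotation_file`;
# `output_json_path` is a placeholder.
def _example_generate_annotation_file(output_json_path):
  def generator():
    yield {
        'source_id': np.array([1]),
        'height': np.array([640]),
        'width': np.array([480]),
        'num_detections': np.array([1]),
        'boxes': np.array([[[10., 20., 30., 60.]]], np.float32),
        'classes': np.array([[5]]),
    }
  generate_annotation_file(generator, output_json_path)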
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Losses for maskrcn model."""
# Import libraries
import tensorflow as tf
class RpnScoreLoss(object):
"""Region Proposal Network score loss function."""
def __init__(self, rpn_batch_size_per_im):
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, score_outputs, labels):
"""Computes total RPN detection loss.
Computes total RPN detection loss including box and score from all levels.
Args:
score_outputs: an `OrderedDict` with keys representing levels and values
representing scores in [batch_size, height, width, num_anchors].
labels: the dictionary returned from the dataloader that includes
groundtruth targets.
Returns:
rpn_score_loss: a scalar tensor representing total score loss.
"""
with tf.name_scope('rpn_loss'):
levels = sorted(score_outputs.keys())
score_losses = []
for level in levels:
score_losses.append(
self._rpn_score_loss(
score_outputs[level],
labels[level],
normalizer=tf.cast(
tf.shape(score_outputs[level])[0] *
self._rpn_batch_size_per_im,
dtype=tf.float32)))
# Sums per level losses to total loss.
return tf.math.add_n(score_losses)
def _rpn_score_loss(self, score_outputs, score_targets, normalizer=1.0):
"""Computes score loss."""
# score_targets has three values:
# (1) score_targets[i]=1, the anchor is a positive sample.
# (2) score_targets[i]=0, negative.
# (3) score_targets[i]=-1, the anchor is ignored (don't care).
with tf.name_scope('rpn_score_loss'):
mask = tf.math.logical_or(tf.math.equal(score_targets, 1),
tf.math.equal(score_targets, 0))
score_targets = tf.math.maximum(score_targets,
tf.zeros_like(score_targets))
score_targets = tf.expand_dims(score_targets, axis=-1)
score_outputs = tf.expand_dims(score_outputs, axis=-1)
score_loss = self._binary_crossentropy(
score_targets, score_outputs, sample_weight=mask)
score_loss /= normalizer
return score_loss
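# Editor's example: a hypothetical sketch, not part of the original file.
# Score targets use {1: positive, 0: negative, -1: ignore}; ignored anchors
# are masked out of the cross-entropy above. Dict keys are pyramid levels.
def _example_rpn_score_loss():
  loss_fn = RpnScoreLoss(rpn_batch_size_per_im=8)
  score_outputs = {3: tf.zeros([1, 2, 2, 1])}  # Logits, one anchor per cell.
  labels = {3: tf.constant([[[[1.], [0.]], [[-1.], [0.]]]])}
  return loss_fn(score_outputs, labels)  # Scalar score loss.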
class RpnBoxLoss(object):
"""Region Proposal Network box regression loss function."""
def __init__(self, huber_loss_delta: float):
# The delta is typically around the mean value of the regression targets.
# For instance, the regression targets of a 512x512 input with 6 anchors
# on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
self._huber_loss = tf.keras.losses.Huber(
delta=huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, box_outputs, labels):
"""Computes total RPN detection loss.
Computes total RPN detection loss including box and score from all levels.
Args:
box_outputs: an `OrderedDict` with keys representing levels and values
representing box regression outputs in
[batch_size, height, width, num_anchors * 4].
labels: the dictionary returned from the dataloader that includes
groundtruth targets.
Returns:
rpn_box_loss: a scalar tensor representing total box regression loss.
"""
with tf.name_scope('rpn_loss'):
levels = sorted(box_outputs.keys())
box_losses = []
for level in levels:
box_losses.append(self._rpn_box_loss(box_outputs[level], labels[level]))
# Sum per level losses to total loss.
return tf.add_n(box_losses)
def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('rpn_box_loss'):
mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
# The loss is normalized by the sum of non-zero weights and additional
# normalizer provided by the function caller. Using + 0.01 here to avoid
# division by zero.
box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
return box_loss
class FastrcnnClassLoss(object):
"""Fast R-CNN classification loss function."""
def __init__(self):
self._categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, class_outputs, class_targets):
"""Computes the class loss (Fast-RCNN branch) of Mask-RCNN.
This function implements the classification loss of the Fast-RCNN.
The classification loss is softmax on all RoIs.
Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
Args:
class_outputs: a float tensor representing the class prediction for each box
with a shape of [batch_size, num_boxes, num_classes].
class_targets: a float tensor representing the class label for each box
with a shape of [batch_size, num_boxes].
Returns:
a scalar tensor representing total class loss.
"""
with tf.name_scope('fast_rcnn_loss'):
batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
class_targets = tf.cast(class_targets, dtype=tf.int32)
class_targets_one_hot = tf.one_hot(class_targets, num_classes)
return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot,
normalizer=batch_size * num_boxes)
def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
normalizer=1.0):
"""Computes classification loss."""
with tf.name_scope('fast_rcnn_class_loss'):
class_loss = self._categorical_crossentropy(class_targets_one_hot,
class_outputs)
class_loss /= normalizer
return class_loss
class FastrcnnBoxLoss(object):
"""Fast R-CNN box regression loss function."""
def __init__(self, huber_loss_delta: float):
# The delta is typically around the mean value of the regression targets.
# For instance, the regression targets of a 512x512 input with 6 anchors
# on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
self._huber_loss = tf.keras.losses.Huber(
delta=huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
def __call__(self, box_outputs, class_targets, box_targets):
"""Computes the box loss (Fast-RCNN branch) of Mask-RCNN.
This function implements the box regression loss of the Fast-RCNN. As the
`box_outputs` produces `num_classes` boxes for each RoI, the reference model
expands `box_targets` to match the shape of `box_outputs` and selects only
the target with which the RoI has the maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
Instead, this function selects the `box_outputs` by the `class_targets` so
that it doesn't expand `box_targets`.
The box loss is smooth L1-loss on only positive samples of RoIs.
Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
Args:
box_outputs: a float tensor representing the box prediction for each box
with a shape of [batch_size, num_boxes, num_classes * 4].
class_targets: a float tensor representing the class label for each box
with a shape of [batch_size, num_boxes].
box_targets: a float tensor representing the box label for each box
with a shape of [batch_size, num_boxes, 4].
Returns:
box_loss: a scalar tensor representing total box regression loss.
"""
with tf.name_scope('fast_rcnn_loss'):
class_targets = tf.cast(class_targets, dtype=tf.int32)
# Selects the box from `box_outputs` based on `class_targets`, with which
# the box has the maximum overlap.
(batch_size, num_rois,
num_class_specific_boxes) = box_outputs.get_shape().as_list()
num_classes = num_class_specific_boxes // 4
box_outputs = tf.reshape(box_outputs,
[batch_size, num_rois, num_classes, 4])
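# Builds flat indices into [batch_size * num_rois * num_classes]: the box
# for element (b, r) and its target class c = class_targets[b, r] lives at
# b * num_rois * num_classes + r * num_classes + c, which the two tiled
# offset terms below add to `class_targets`.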
box_indices = tf.reshape(
class_targets + tf.tile(
tf.expand_dims(
tf.range(batch_size) * num_rois * num_classes, 1),
[1, num_rois]) + tf.tile(
tf.expand_dims(tf.range(num_rois) * num_classes, 0),
[batch_size, 1]), [-1])
box_outputs = tf.matmul(
tf.one_hot(
box_indices,
batch_size * num_rois * num_classes,
dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets)
def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('fast_rcnn_box_loss'):
mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
[1, 1, 4])
mask = tf.cast(mask, dtype=tf.float32)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
# The loss is normalized by the number of ones in the mask and an
# additional normalizer provided by the caller; + 0.01 avoids division
# by zero.
box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01)
return box_loss
class MaskrcnnLoss(object):
"""Mask R-CNN instance segmentation mask loss function."""
def __init__(self):
self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
def __call__(self, mask_outputs, mask_targets, select_class_targets):
"""Computes the mask loss of Mask-RCNN.
This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
produces `num_classes` masks for each RoI, the reference model expands
`mask_targets` to match the shape of `mask_outputs` and selects only the
target with which the RoI has the maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long
Instead, this implementation selects the `mask_outputs` by the `class_targets`
so that it doesn't expand `mask_targets`. Note that the selection logic is
done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
Args:
mask_outputs: a float tensor representing the prediction for each mask,
with a shape of
[batch_size, num_masks, mask_height, mask_width].
mask_targets: a float tensor representing the binary mask of ground truth
labels for each mask with a shape of
[batch_size, num_masks, mask_height, mask_width].
select_class_targets: a tensor with a shape of [batch_size, num_masks],
representing the foreground mask targets.
Returns:
mask_loss: a float tensor representing total mask loss.
"""
with tf.name_scope('mask_rcnn_loss'):
(batch_size, num_masks, mask_height,
mask_width) = mask_outputs.get_shape().as_list()
weights = tf.tile(
tf.reshape(tf.greater(select_class_targets, 0),
[batch_size, num_masks, 1, 1]),
[1, 1, mask_height, mask_width])
weights = tf.cast(weights, dtype=tf.float32)
mask_targets = tf.expand_dims(mask_targets, axis=-1)
mask_outputs = tf.expand_dims(mask_outputs, axis=-1)
mask_loss = self._binary_crossentropy(mask_targets, mask_outputs,
sample_weight=weights)
# The loss is normalized by the number of 1's in weights and
# + 0.01 is used to avoid division by zero.
return mask_loss / (tf.reduce_sum(weights) + 0.01)
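# Editor's example: a hypothetical sketch, not part of the original file.
# Shapes follow the docstring above: one image with two sampled 4x4 masks,
# where the second RoI is background and therefore carries no mask loss.
def _example_maskrcnn_loss():
  loss_fn = MaskrcnnLoss()
  mask_outputs = tf.zeros([1, 2, 4, 4])          # Mask logits.
  mask_targets = tf.ones([1, 2, 4, 4])           # Binary groundtruth masks.
  select_class_targets = tf.constant([[1, 0]])   # Foreground, background.
  return loss_fn(mask_outputs, mask_targets, select_class_targets)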
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Losses used for detection models."""
# Import libraries
import tensorflow as tf
def focal_loss(logits, targets, alpha, gamma):
"""Compute the focal loss between `logits` and the golden `target` values.
Focal loss = -alpha_t * (1 - pt)^gamma * log(pt), where pt is the
probability of the true class and alpha_t is `alpha` for positive examples
and `1 - alpha` for negative examples.
Args:
logits: A float32 tensor of size
[batch, d_1, ..., d_k, n_classes].
targets: A float32 tensor of size
[batch, d_1, ..., d_k, n_classes].
alpha: A float32 scalar multiplying alpha to the loss from positive examples
and (1-alpha) to the loss from negative examples.
gamma: A float32 scalar modulating loss from hard and easy examples.
Returns:
loss: A float32 Tensor of size
[batch, d_1, ..., d_k, n_classes] representing
normalized loss on the prediction map.
"""
with tf.name_scope('focal_loss'):
positive_label_mask = tf.equal(targets, 1.0)
cross_entropy = (
tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits))
probs = tf.sigmoid(logits)
probs_gt = tf.where(positive_label_mask, probs, 1.0 - probs)
# With small gamma, the implementation could produce NaN during back prop.
modulator = tf.pow(1.0 - probs_gt, gamma)
loss = modulator * cross_entropy
weighted_loss = tf.where(positive_label_mask, alpha * loss,
(1.0 - alpha) * loss)
return weighted_loss
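# Editor's example: a hypothetical sketch, not part of the original file.
# With gamma=0 and alpha=0.5 this reduces to (0.5x) plain sigmoid
# cross-entropy; larger gamma down-weights well-classified examples.
def _example_focal_loss():
  logits = tf.constant([[2.0, -2.0]])   # One confident hit, one miss.
  targets = tf.constant([[1.0, 1.0]])   # Both are positives.
  # The miss (low pt) dominates; the confident hit is heavily down-weighted.
  return focal_loss(logits, targets, alpha=0.25, gamma=2.0)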
class FocalLoss(tf.keras.losses.Loss):
"""Implements a Focal loss for classification problems.
Reference:
[Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
"""
def __init__(self,
alpha,
gamma,
num_classes,
reduction=tf.keras.losses.Reduction.AUTO,
name=None):
"""Initializes `FocalLoss`.
Arguments:
alpha: The `alpha` weight factor for binary class imbalance.
gamma: The `gamma` focusing parameter to re-weight loss.
num_classes: Number of foreground classes.
reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
loss. Default value is `AUTO`. `AUTO` indicates that the reduction
option will be determined by the usage context. For almost all cases
this defaults to `SUM_OVER_BATCH_SIZE`. When used with
`tf.distribute.Strategy`, outside of built-in training loops such as
`tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
will raise an error. Please see this custom training [tutorial](
https://www.tensorflow.org/tutorials/distribute/custom_training) for
more details.
name: Optional name for the op. Defaults to 'retinanet_class_loss'.
"""
self._num_classes = num_classes
self._alpha = alpha
self._gamma = gamma
super(FocalLoss, self).__init__(reduction=reduction, name=name)
def call(self, y_true, y_pred):
"""Invokes the `FocalLoss`.
Arguments:
y_true: Ordered Dict with level to [batch, height, width, num_anchors].
For example,
{3: tf.Tensor(shape=[32, 512, 512, 9], dtype=tf.int64),
4: tf.Tensor(shape=[32, 256, 256, 9], dtype=tf.int64)}
y_pred: Ordered Dict with level to [batch, height, width, num_anchors *
num_classes]. For example,
{3: tf.Tensor(shape=[32, 512, 512, 9 * 21], dtype=tf.float32),
4: tf.Tensor(shape=[32, 256, 256, 9 * 21], dtype=tf.float32)}
Returns:
Summed loss float `Tensor`.
"""
flattened_cls_outputs = []
flattened_labels = []
batch_size = None
for level in y_pred.keys():
cls_output = y_pred[level]
label = y_true[level]
if batch_size is None:
batch_size = cls_output.shape[0] or tf.shape(cls_output)[0]
flattened_cls_outputs.append(
tf.reshape(cls_output, [batch_size, -1, self._num_classes]))
flattened_labels.append(tf.reshape(label, [batch_size, -1]))
cls_outputs = tf.concat(flattened_cls_outputs, axis=1)
labels = tf.concat(flattened_labels, axis=1)
cls_targets_one_hot = tf.one_hot(labels, self._num_classes)
return focal_loss(
tf.cast(cls_outputs, dtype=tf.float32),
tf.cast(cls_targets_one_hot, dtype=tf.float32), self._alpha,
self._gamma)
def get_config(self):
config = {
'alpha': self._alpha,
'gamma': self._gamma,
'num_classes': self._num_classes,
}
base_config = super(FocalLoss, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class RetinanetBoxLoss(tf.keras.losses.Loss):
"""RetinaNet box Huber loss."""
def __init__(self,
delta,
reduction=tf.keras.losses.Reduction.AUTO,
name=None):
"""Initializes `RetinanetBoxLoss`.
Arguments:
delta: A float, the point where the Huber loss function changes from a
quadratic to linear.
reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
loss. Default value is `AUTO`. `AUTO` indicates that the reduction
option will be determined by the usage context. For almost all cases
this defaults to `SUM_OVER_BATCH_SIZE`. When used with
`tf.distribute.Strategy`, outside of built-in training loops such as
`tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE`
will raise an error. Please see this custom training [tutorial](
https://www.tensorflow.org/tutorials/distribute/custom_training) for
more details.
name: Optional name for the op. Defaults to 'retinanet_class_loss'.
"""
self._huber_loss = tf.keras.losses.Huber(
delta=delta, reduction=tf.keras.losses.Reduction.NONE)
self._delta = delta
super(RetinanetBoxLoss, self).__init__(reduction=reduction, name=name)
def call(self, y_true, y_pred):
"""Computes box detection loss.
Computes total detection loss including box and class loss from all levels.
Arguments:
y_true: Ordered Dict with level to [batch, height, width,
num_anchors * 4]. For example,
{3: tf.Tensor(shape=[32, 512, 512, 9 * 4], dtype=tf.float32),
4: tf.Tensor(shape=[32, 256, 256, 9 * 4], dtype=tf.float32)}
y_pred: Ordered Dict with level to [batch, height, width,
num_anchors * 4]. For example,
{3: tf.Tensor(shape=[32, 512, 512, 9 * 4], dtype=tf.float32),
4: tf.Tensor(shape=[32, 256, 256, 9 * 4], dtype=tf.float32)}
Returns:
a float tensor representing the total box regression loss.
"""
# Flattens and concatenates the box outputs and targets from all levels
# before computing the Huber loss.
flattened_box_outputs = []
flattened_labels = []
batch_size = None
for level in y_pred.keys():
box_output = y_pred[level]
label = y_true[level]
if batch_size is None:
batch_size = box_output.shape[0] or tf.shape(box_output)[0]
flattened_box_outputs.append(tf.reshape(box_output, [batch_size, -1, 4]))
flattened_labels.append(tf.reshape(label, [batch_size, -1, 4]))
box_outputs = tf.concat(flattened_box_outputs, axis=1)
labels = tf.concat(flattened_labels, axis=1)
loss = self._huber_loss(labels, box_outputs)
return loss
def get_config(self):
config = {
'delta': self._delta,
}
base_config = super(RetinanetBoxLoss, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
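# Editor's example: a hypothetical sketch, not part of the original file.
# Both losses consume per-level dicts; here a single level 3 with batch=2,
# an 8x8 feature map, 2 anchors per location and 3 classes. An explicit
# reduction is passed because AUTO raises outside built-in training loops.
def _example_retinanet_losses():
  cls_loss_fn = FocalLoss(alpha=0.25, gamma=1.5, num_classes=3,
                          reduction=tf.keras.losses.Reduction.SUM)
  box_loss_fn = RetinanetBoxLoss(delta=0.1,
                                 reduction=tf.keras.losses.Reduction.SUM)
  y_true_cls = {3: tf.zeros([2, 8, 8, 2], tf.int32)}
  y_pred_cls = {3: tf.zeros([2, 8, 8, 2 * 3])}
  y_true_box = {3: tf.zeros([2, 8, 8, 2 * 4])}
  y_pred_box = {3: tf.zeros([2, 8, 8, 2 * 4])}
  return cls_loss_fn(y_true_cls, y_pred_cls), box_loss_fn(y_true_box,
                                                          y_pred_box)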
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Backbones package definition."""
from official.vision.beta.modeling.backbones.efficientnet import EfficientNet
from official.vision.beta.modeling.backbones.resnet import ResNet
from official.vision.beta.modeling.backbones.resnet_3d import ResNet3D
from official.vision.beta.modeling.backbones.revnet import RevNet
from official.vision.beta.modeling.backbones.spinenet import SpineNet
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains definitions of EfficientNet Networks."""
import math
# Import libraries
from absl import logging
import tensorflow as tf
from official.modeling import tf_utils
from official.vision.beta.modeling.layers import nn_blocks
layers = tf.keras.layers
# The fixed EfficientNet-B0 architecture discovered by NAS.
# Each element represents a specification of a building block:
# (block_fn, block_repeats, kernel_size, strides, expand_ratio, in_filters,
# out_filters, is_output)
EN_B0_BLOCK_SPECS = [
('mbconv', 1, 3, 1, 1, 32, 16, False),
('mbconv', 2, 3, 2, 6, 16, 24, True),
('mbconv', 2, 5, 2, 6, 24, 40, True),
('mbconv', 3, 3, 2, 6, 40, 80, False),
('mbconv', 3, 5, 1, 6, 80, 112, True),
('mbconv', 4, 5, 2, 6, 112, 192, False),
('mbconv', 1, 3, 1, 6, 192, 320, True),
]
SCALING_MAP = {
'b0': dict(width_scale=1.0, depth_scale=1.0),
'b1': dict(width_scale=1.0, depth_scale=1.1),
'b2': dict(width_scale=1.1, depth_scale=1.2),
'b3': dict(width_scale=1.2, depth_scale=1.4),
'b4': dict(width_scale=1.4, depth_scale=1.8),
'b5': dict(width_scale=1.6, depth_scale=2.2),
'b6': dict(width_scale=1.8, depth_scale=2.6),
'b7': dict(width_scale=2.0, depth_scale=3.1),
}
def round_filters(filters, multiplier, divisor=8, min_depth=None, skip=False):
"""Round number of filters based on depth multiplier."""
orig_f = filters
if skip or not multiplier:
return filters
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_filters < 0.9 * filters:
new_filters += divisor
logging.info('round_filter input=%s output=%s', orig_f, new_filters)
return int(new_filters)
def round_repeats(repeats, multiplier, skip=False):
"""Round number of filters based on depth multiplier."""
if skip or not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
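# Editor's note (worked example, not part of the original file): with the b4
# scaling (width 1.4, depth 1.8), round_filters(32, 1.4) widens 32 * 1.4 =
# 44.8 to 48 (the nearest multiple of divisor=8 that stays within 10% of the
# scaled value), and round_repeats(3, 1.8) deepens to ceil(5.4) = 6.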
def block_spec_decoder(specs, width_scale, depth_scale):
"""Decode specs for a block."""
decoded_specs = []
for s in specs:
s = s + (
width_scale,
depth_scale,
)
decoded_specs.append(BlockSpec(*s))
return decoded_specs
class BlockSpec(object):
"""A container class that specifies the block configuration for MnasNet."""
def __init__(self, block_fn, block_repeats, kernel_size, strides,
expand_ratio, in_filters, out_filters, is_output, width_scale,
depth_scale):
self.block_fn = block_fn
self.block_repeats = round_repeats(block_repeats, depth_scale)
self.kernel_size = kernel_size
self.strides = strides
self.expand_ratio = expand_ratio
self.in_filters = round_filters(in_filters, width_scale)
self.out_filters = round_filters(out_filters, width_scale)
self.is_output = is_output
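# Editor's example: a hypothetical sketch, not part of the original file.
# Decoding the fixed B0 specs with a scaling map entry yields the widened and
# deepened block list consumed by the model constructor below.
def _example_decode_b4_specs():
  scaling = SCALING_MAP['b4']
  return block_spec_decoder(EN_B0_BLOCK_SPECS, scaling['width_scale'],
                            scaling['depth_scale'])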
@tf.keras.utils.register_keras_serializable(package='Vision')
class EfficientNet(tf.keras.Model):
"""Class to build EfficientNet family model."""
def __init__(self,
model_id,
input_specs=layers.InputSpec(shape=[None, None, None, 3]),
se_ratio=0.0,
stochastic_depth_drop_rate=0.0,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""EfficientNet initialization function.
Args:
model_id: `str` model id of EfficientNet.
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
se_ratio: `float` squeeze and excitation ratio for inverted bottleneck
blocks.
stochastic_depth_drop_rate: `float` drop rate for drop connect layer.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Default to None.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
Default to None.
activation: `str` name of the activation function.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
self._model_id = model_id
self._input_specs = input_specs
self._se_ratio = se_ratio
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._use_sync_bn = use_sync_bn
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = layers.experimental.SyncBatchNormalization
else:
self._norm = layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = -1
else:
bn_axis = 1
# Build EfficientNet.
inputs = tf.keras.Input(shape=input_specs.shape[1:])
width_scale = SCALING_MAP[model_id]['width_scale']
depth_scale = SCALING_MAP[model_id]['depth_scale']
# Build stem.
x = layers.Conv2D(
filters=round_filters(32, width_scale),
kernel_size=3,
strides=2,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
inputs)
x = self._norm(
axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)(
x)
x = tf_utils.get_activation(activation)(x)
# Build intermediate blocks.
endpoints = {}
endpoint_level = 2
decoded_specs = block_spec_decoder(EN_B0_BLOCK_SPECS, width_scale,
depth_scale)
for i, specs in enumerate(decoded_specs):
x = self._block_group(
inputs=x, specs=specs, name='block_group_{}'.format(i))
if specs.is_output:
endpoints[endpoint_level] = x
endpoint_level += 1
# Build output specs for downstream tasks.
    self._output_specs = {l: endpoints[l].get_shape() for l in endpoints.keys()}
# Build the final conv for classification.
x = layers.Conv2D(
filters=round_filters(1280, width_scale),
kernel_size=1,
strides=1,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
x = self._norm(
axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)(
x)
endpoints[endpoint_level] = tf_utils.get_activation(activation)(x)
super(EfficientNet, self).__init__(
inputs=inputs, outputs=endpoints, **kwargs)
def _block_group(self, inputs, specs, name='block_group'):
"""Creates one group of blocks for the EfficientNet model.
Args:
      inputs: `Tensor` of size `[batch, height, width, channels]`.
      specs: specifications for one inverted bottleneck block group.
      name: `str` name for the block group.
Returns:
The output `Tensor` of the block layer.
"""
if specs.block_fn == 'mbconv':
block_fn = nn_blocks.InvertedBottleneckBlock
else:
raise ValueError('Block func {} not supported.'.format(specs.block_fn))
x = block_fn(
in_filters=specs.in_filters,
out_filters=specs.out_filters,
expand_ratio=specs.expand_ratio,
strides=specs.strides,
kernel_size=specs.kernel_size,
se_ratio=self._se_ratio,
stochastic_depth_drop_rate=self._stochastic_depth_drop_rate,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
inputs)
for _ in range(1, specs.block_repeats):
x = block_fn(
in_filters=specs.out_filters, # Set 'in_filters' to 'out_filters'.
out_filters=specs.out_filters,
expand_ratio=specs.expand_ratio,
strides=1, # Fix strides to 1.
kernel_size=specs.kernel_size,
se_ratio=self._se_ratio,
stochastic_depth_drop_rate=self._stochastic_depth_drop_rate,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
x)
return tf.identity(x, name=name)
def get_config(self):
config_dict = {
'model_id': self._model_id,
'se_ratio': self._se_ratio,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
return config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
@property
def output_specs(self):
"""A dict of {level: TensorShape} pairs for the model output."""
return self._output_specs
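# Usage sketch (illustrative, not part of the module): build a B0 backbone and
# inspect its multi-scale endpoints.
#
#   backbone = EfficientNet(model_id='b0')
#   endpoints = backbone(tf.keras.Input(shape=(224, 224, 3)))
#   # endpoints maps levels to feature maps; e.g. endpoints[5] is the
#   # stride-32 feature with 320 channels for 'b0' (see the tests below).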
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for EfficientNet."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.modeling.backbones import efficientnet
class EfficientNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(32, 224)
def test_network_creation(self, input_size):
"""Test creation of EfficientNet family models."""
tf.keras.backend.set_image_data_format('channels_last')
network = efficientnet.EfficientNet(model_id='b0')
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
self.assertAllEqual([1, input_size / 2**2, input_size / 2**2, 24],
endpoints[2].shape.as_list())
self.assertAllEqual([1, input_size / 2**3, input_size / 2**3, 40],
endpoints[3].shape.as_list())
self.assertAllEqual([1, input_size / 2**4, input_size / 2**4, 112],
endpoints[4].shape.as_list())
self.assertAllEqual([1, input_size / 2**5, input_size / 2**5, 320],
endpoints[5].shape.as_list())
@parameterized.parameters('b0', 'b3', 'b6')
def test_network_scaling(self, model_id):
"""Test compound scaling."""
efficientnet_params = {
'b0': 4049564,
'b3': 10783528,
'b6': 40960136,
}
tf.keras.backend.set_image_data_format('channels_last')
input_size = 32
network = efficientnet.EfficientNet(model_id=model_id, se_ratio=0.25)
self.assertEqual(network.count_params(), efficientnet_params[model_id])
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
_ = network(inputs)
@parameterized.parameters(1, 3)
def test_input_specs(self, input_dim):
"""Test different input feature dimensions."""
tf.keras.backend.set_image_data_format('channels_last')
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim])
network = efficientnet.EfficientNet(model_id='b0', input_specs=input_specs)
inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1)
_ = network(inputs)
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
model_id='b0',
se_ratio=0.25,
stochastic_depth_drop_rate=None,
use_sync_bn=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
norm_momentum=0.99,
norm_epsilon=0.001,
)
network = efficientnet.EfficientNet(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = efficientnet.EfficientNet.from_config(network.get_config())
    # Validate that the config can be serialized to JSON.
_ = new_network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""factory method."""
# Import libraries
import tensorflow as tf
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling.backbones import spinenet
def build_backbone(input_specs: tf.keras.layers.InputSpec,
model_config,
l2_regularizer: tf.keras.regularizers.Regularizer = None):
"""Builds backbone from a config.
Args:
input_specs: tf.keras.layers.InputSpec.
    model_config: a model config with `backbone` and `norm_activation` fields.
l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
Returns:
tf.keras.Model instance of the backbone.
"""
backbone_type = model_config.backbone.type
backbone_cfg = model_config.backbone.get()
norm_activation_config = model_config.norm_activation
if backbone_type == 'resnet':
backbone = backbones.ResNet(
model_id=backbone_cfg.model_id,
input_specs=input_specs,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif backbone_type == 'efficientnet':
backbone = backbones.EfficientNet(
model_id=backbone_cfg.model_id,
input_specs=input_specs,
stochastic_depth_drop_rate=backbone_cfg.stochastic_depth_drop_rate,
se_ratio=backbone_cfg.se_ratio,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
elif backbone_type == 'spinenet':
model_id = backbone_cfg.model_id
if model_id not in spinenet.SCALING_MAP:
raise ValueError(
'SpineNet-{} is not a valid architecture.'.format(model_id))
scaling_params = spinenet.SCALING_MAP[model_id]
backbone = backbones.SpineNet(
input_specs=input_specs,
min_level=model_config.min_level,
max_level=model_config.max_level,
endpoints_num_filters=scaling_params['endpoints_num_filters'],
resample_alpha=scaling_params['resample_alpha'],
block_repeats=scaling_params['block_repeats'],
filter_size_scale=scaling_params['filter_size_scale'],
kernel_regularizer=l2_regularizer,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon)
elif backbone_type == 'revnet':
backbone = backbones.RevNet(
model_id=backbone_cfg.model_id,
input_specs=input_specs,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
else:
    raise ValueError('Backbone {!r} not implemented.'.format(backbone_type))
return backbone
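# Usage sketch, mirroring the factory tests further below (config classes
# assumed to come from official.vision.beta.configs):
#
#   model_config = retinanet_cfg.RetinaNet(
#       backbone=backbones_cfg.Backbone(
#           type='efficientnet',
#           efficientnet=backbones_cfg.EfficientNet(model_id='b0')))
#   backbone = build_backbone(
#       input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
#       model_config=model_config)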
def build_backbone_3d(input_specs: tf.keras.layers.InputSpec,
model_config,
l2_regularizer: tf.keras.regularizers.Regularizer = None):
"""Builds 3d backbone from a config.
Args:
input_specs: tf.keras.layers.InputSpec.
    model_config: a model config with `backbone` and `norm_activation` fields.
l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None.
Returns:
tf.keras.Model instance of the backbone.
"""
backbone_type = model_config.backbone.type
backbone_cfg = model_config.backbone.get()
norm_activation_config = model_config.norm_activation
# Flatten configs before passing to the backbone.
temporal_strides = []
temporal_kernel_sizes = []
use_self_gating = []
for block_spec in backbone_cfg.block_specs:
temporal_strides.append(block_spec.temporal_strides)
temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes)
use_self_gating.append(block_spec.use_self_gating)
if backbone_type == 'resnet_3d':
backbone = backbones.ResNet3D(
model_id=backbone_cfg.model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
use_self_gating=use_self_gating,
input_specs=input_specs,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
else:
    raise ValueError('Backbone {!r} not implemented.'.format(backbone_type))
return backbone
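# Note: the loop above flattens the per-block temporal settings from the
# config into parallel lists (one entry per block group) before handing them
# to ResNet3D; test_resnet_3d_creation further below exercises the same
# flattening directly.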
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for factory functions."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.distribute import combinations
from official.vision.beta.configs import backbones as backbones_cfg
from official.vision.beta.configs import backbones_3d as backbones_3d_cfg
from official.vision.beta.configs import common as common_cfg
from official.vision.beta.configs import retinanet as retinanet_cfg
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling.backbones import factory
class FactoryTest(tf.test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(model_id=[18, 34, 50, 101, 152],))
def test_resnet_creation(self, model_id):
"""Test creation of ResNet models."""
network = backbones.ResNet(
model_id=model_id, norm_momentum=0.99, norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='resnet',
resnet=backbones_cfg.ResNet(model_id=model_id))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(
model_id=['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
se_ratio=[0.0, 0.25],
))
def test_efficientnet_creation(self, model_id, se_ratio):
"""Test creation of EfficientNet models."""
network = backbones.EfficientNet(
model_id=model_id,
se_ratio=se_ratio,
norm_momentum=0.99,
norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='efficientnet',
efficientnet=backbones_cfg.EfficientNet(
model_id=model_id, se_ratio=se_ratio))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(combinations.combine(model_id=['49'],))
def test_spinenet_creation(self, model_id):
"""Test creation of SpineNet models."""
input_size = 128
min_level = 3
max_level = 7
input_specs = tf.keras.layers.InputSpec(
shape=[None, input_size, input_size, 3])
network = backbones.SpineNet(
input_specs=input_specs,
min_level=min_level,
max_level=max_level,
norm_momentum=0.99,
norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='spinenet',
spinenet=backbones_cfg.SpineNet(model_id=model_id))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(
shape=[None, input_size, input_size, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(
combinations.combine(model_id=[38, 56, 104],))
def test_revnet_creation(self, model_id):
"""Test creation of RevNet models."""
network = backbones.RevNet(
model_id=model_id, norm_momentum=0.99, norm_epsilon=1e-5)
backbone_config = backbones_cfg.Backbone(
type='revnet',
revnet=backbones_cfg.RevNet(model_id=model_id))
norm_activation_config = common_cfg.NormActivation(
norm_momentum=0.99, norm_epsilon=1e-5)
model_config = retinanet_cfg.RetinaNet(
backbone=backbone_config, norm_activation=norm_activation_config)
factory_network = factory.build_backbone(
input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
model_config=model_config)
network_config = network.get_config()
factory_network_config = factory_network.get_config()
self.assertEqual(network_config, factory_network_config)
@combinations.generate(combinations.combine(model_type=['resnet_3d'],))
def test_resnet_3d_creation(self, model_type):
"""Test creation of ResNet 3D models."""
backbone_cfg = backbones_3d_cfg.Backbone3D(type=model_type).get()
temporal_strides = []
temporal_kernel_sizes = []
for block_spec in backbone_cfg.block_specs:
temporal_strides.append(block_spec.temporal_strides)
temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes)
_ = backbones.ResNet3D(
model_id=backbone_cfg.model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
norm_momentum=0.99,
norm_epsilon=1e-5)
if __name__ == '__main__':
tf.test.main()