Commit cc748b2a authored by Abdullah Rashwan's avatar Abdullah Rashwan Committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 329754787
parent 2f788e1d
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The generic parser interface."""
import abc
class Parser(metaclass=abc.ABCMeta):
  """Parses data and produces tensors to be consumed by models."""
@abc.abstractmethod
def _parse_train_data(self, decoded_tensors):
"""Generates images and labels that are usable for model training.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
@abc.abstractmethod
def _parse_eval_data(self, decoded_tensors):
"""Generates images and labels that are usable for model evaluation.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
pass
def parse_fn(self, is_training):
"""Returns a parse fn that reads and parses raw tensors from the decoder.
Args:
is_training: a `bool` to indicate whether it is in training mode.
Returns:
parse: a `callable` that takes the decoded tensors and generates the
  (images, labels) tuple, where labels is a dict of Tensors that
  contains labels.
"""
def parse(decoded_tensors):
"""Parses the serialized example data."""
if is_training:
return self._parse_train_data(decoded_tensors)
else:
return self._parse_eval_data(decoded_tensors)
return parse
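# A minimal usage sketch of the interface above. The `decoded_tensors` keys
# ('image', 'label') are hypothetical and depend on whichever decoder feeds
# the parser; a concrete subclass only has to implement the two hooks.
class _ExamplePassthroughParser(Parser):
  """Toy parser that forwards the image and wraps the label in a dict."""

  def _parse_train_data(self, decoded_tensors):
    return decoded_tensors['image'], {'label': decoded_tensors['label']}

  def _parse_eval_data(self, decoded_tensors):
    # This toy example uses identical logic for training and evaluation.
    return self._parse_train_data(decoded_tensors)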
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data parser and processing for RetinaNet.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for RetinaNet.
"""
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import utils
from official.vision.beta.ops import anchor
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops
class Parser(parser.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
match_threshold=0.5,
unmatched_threshold=0.5,
aug_rand_hflip=False,
aug_scale_min=1.0,
aug_scale_max=1.0,
use_autoaugment=False,
autoaugment_policy_name='v0',
skip_crowd_during_training=True,
max_num_instances=100,
dtype='bfloat16',
mode=None):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
  output_size should be divisible by the largest feature stride
  2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added on each
  level. For instance, num_scales=2 adds one additional intermediate
  anchor scale [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect ratio
  anchors added on each level. The number indicates the ratio of width to
  height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
  on each scale level.
anchor_size: `float` number representing the scale of the base anchor
  size relative to the feature stride 2^level.
match_threshold: `float` number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: `float` number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
aug_rand_hflip: `bool`, if True, augment training with random horizontal
flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy
during training.
autoaugment_policy_name: `string` that specifies the name of the
AutoAugment policy that will be used during training.
skip_crowd_during_training: `bool`, if True, skip annotations with
  `is_crowd` equal to 1 during training.
max_num_instances: `int`, the maximum number of instances in an image.
  The groundtruth data will be padded to `max_num_instances`.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
mode: a ModeKeys. Specifies if this is training, evaluation, prediction or
prediction with groundtruths in the outputs.
"""
self._mode = mode
self._max_num_instances = max_num_instances
self._skip_crowd_during_training = skip_crowd_during_training
# Anchor.
self._output_size = output_size
self._min_level = min_level
self._max_level = max_level
self._num_scales = num_scales
self._aspect_ratios = aspect_ratios
self._anchor_size = anchor_size
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
# Data augmentation.
self._aug_rand_hflip = aug_rand_hflip
self._aug_scale_min = aug_scale_min
self._aug_scale_max = aug_scale_max
# Data Augmentation with AutoAugment.
self._use_autoaugment = use_autoaugment
self._autoaugment_policy_name = autoaugment_policy_name
# Device.
    self._use_bfloat16 = (dtype == 'bfloat16')
def _parse_train_data(self, data):
"""Parses data for training and evaluation."""
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
is_crowds = data['groundtruth_is_crowd']
# Skips annotations with `is_crowd` = True.
if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(input=classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            pred=tf.greater(tf.size(input=is_crowds), 0),
            true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            false_fn=lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
classes = tf.gather(classes, indices)
boxes = tf.gather(boxes, indices)
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Flips image randomly during training.
if self._aug_rand_hflip:
image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=self._aug_scale_min,
aug_scale_max=self._aug_scale_max)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes,
tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
# If bfloat16 is used, casts input image to tf.bfloat16.
if self._use_bfloat16:
image = tf.cast(image, dtype=tf.bfloat16)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
}
return image, labels
def _parse_eval_data(self, data):
"""Parses data for training and evaluation."""
groundtruths = {}
classes = data['groundtruth_classes']
boxes = data['groundtruth_boxes']
# Gets original image and its size.
image = data['image']
image_shape = tf.shape(input=image)[0:2]
# Normalizes image with mean and std pixel values.
image = preprocess_ops.normalize_image(image)
# Converts boxes from normalized coordinates to pixel coordinates.
boxes = box_ops.denormalize_boxes(boxes, image_shape)
# Resizes and crops image.
image, image_info = preprocess_ops.resize_and_crop_image(
image,
self._output_size,
padded_size=preprocess_ops.compute_padded_size(self._output_size,
2**self._max_level),
aug_scale_min=1.0,
aug_scale_max=1.0)
image_height, image_width, _ = image.get_shape().as_list()
# Resizes and crops boxes.
image_scale = image_info[2, :]
offset = image_info[3, :]
boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale,
image_info[1, :], offset)
# Filters out ground truth boxes that are all zeros.
indices = box_ops.get_non_empty_box_indices(boxes)
boxes = tf.gather(boxes, indices)
classes = tf.gather(classes, indices)
# Assigns anchors.
input_anchor = anchor.build_anchor_generator(
min_level=self._min_level,
max_level=self._max_level,
num_scales=self._num_scales,
aspect_ratios=self._aspect_ratios,
anchor_size=self._anchor_size)
anchor_boxes = input_anchor(image_size=(image_height, image_width))
anchor_labeler = anchor.AnchorLabeler(self._match_threshold,
self._unmatched_threshold)
(cls_targets, box_targets, cls_weights,
box_weights) = anchor_labeler.label_anchors(
anchor_boxes, boxes,
tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
# If bfloat16 is used, casts input image to tf.bfloat16.
if self._use_bfloat16:
image = tf.cast(image, dtype=tf.bfloat16)
# Sets up groundtruth data for evaluation.
groundtruths = {
'source_id': data['source_id'],
'height': data['height'],
'width': data['width'],
'num_detections': tf.shape(data['groundtruth_classes']),
'image_info': image_info,
'boxes': box_ops.denormalize_boxes(
data['groundtruth_boxes'], image_shape),
'classes': data['groundtruth_classes'],
'areas': data['groundtruth_area'],
'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
}
groundtruths['source_id'] = utils.process_source_id(
groundtruths['source_id'])
groundtruths = utils.pad_groundtruths_to_fixed_size(
groundtruths, self._max_num_instances)
# Packs labels for model_fn outputs.
labels = {
'cls_targets': cls_targets,
'box_targets': box_targets,
'anchor_boxes': anchor_boxes,
'cls_weights': cls_weights,
'box_weights': box_weights,
'image_info': image_info,
'groundtruths': groundtruths,
}
return image, labels
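# A minimal wiring sketch, assuming TFRecord files of COCO-style tf.Examples
# and the sibling `tf_example_decoder` module; `file_pattern` and all
# hyperparameter values below are placeholders.
def _build_retinanet_dataset_sketch(file_pattern, is_training, batch_size):
  """Chains decoder -> parser into a tf.data pipeline."""
  from official.vision.beta.dataloaders import tf_example_decoder

  example_decoder = tf_example_decoder.TfExampleDecoder()
  retinanet_parser = Parser(
      output_size=[640, 640], min_level=3, max_level=7, num_scales=3,
      aspect_ratios=[0.5, 1.0, 2.0], anchor_size=4.0)
  dataset = tf.data.Dataset.list_files(file_pattern)
  dataset = dataset.interleave(tf.data.TFRecordDataset)
  dataset = dataset.map(example_decoder.decode)
  # parse_fn dispatches to _parse_train_data or _parse_eval_data.
  dataset = dataset.map(retinanet_parser.parse_fn(is_training))
  return dataset.batch(batch_size, drop_remainder=is_training)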
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for retinanet_parser.py."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.core import input_reader
from official.modeling.hyperparams import config_definitions as cfg
from official.vision.beta.dataloaders import retinanet_input
from official.vision.beta.dataloaders import tf_example_decoder
class RetinaNetInputTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
([512, 640], True, True, True),
([640, 640], False, False, False),
)
def testRetinanetInputReader(self,
output_size,
skip_crowd_during_training,
use_autoaugment,
is_training):
batch_size = 2
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [0.5, 1.0, 2.0]
anchor_size = 3
max_num_instances = 100
params = cfg.DataConfig(
input_path='/placer/prod/home/snaggletooth/test/data/coco/val*',
global_batch_size=batch_size,
is_training=is_training)
decoder = tf_example_decoder.TfExampleDecoder()
parser = retinanet_input.Parser(
output_size=output_size,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
skip_crowd_during_training=skip_crowd_during_training,
use_autoaugment=use_autoaugment,
max_num_instances=max_num_instances,
dtype='bfloat16')
reader = input_reader.InputReader(
params,
dataset_fn=tf.data.TFRecordDataset,
decoder_fn=decoder.decode,
parser_fn=parser.parse_fn(params.is_training))
dataset = reader.read()
iterator = iter(dataset)
image, labels = next(iterator)
np_image = image.numpy()
np_labels = tf.nest.map_structure(lambda x: x.numpy(), labels)
# Checks image shape.
self.assertEqual(list(np_image.shape),
[batch_size, output_size[0], output_size[1], 3])
# Checks keys in labels.
if is_training:
self.assertCountEqual(
np_labels.keys(),
['cls_targets', 'box_targets', 'anchor_boxes', 'cls_weights',
'box_weights', 'image_info'])
else:
self.assertCountEqual(
np_labels.keys(),
['cls_targets', 'box_targets', 'anchor_boxes', 'cls_weights',
'box_weights', 'groundtruths', 'image_info'])
# Checks shapes of `image_info` and `anchor_boxes`.
self.assertEqual(np_labels['image_info'].shape, (batch_size, 4, 2))
n_anchors = 0
for level in range(min_level, max_level + 1):
stride = 2 ** level
      output_size_l = [output_size[0] // stride, output_size[1] // stride]
anchors_per_location = num_scales * len(aspect_ratios)
self.assertEqual(
list(np_labels['anchor_boxes'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
4 * anchors_per_location])
n_anchors += output_size_l[0] * output_size_l[1] * anchors_per_location
# Checks shapes of training objectives.
self.assertEqual(np_labels['cls_weights'].shape, (batch_size, n_anchors))
for level in range(min_level, max_level + 1):
stride = 2 ** level
      output_size_l = [output_size[0] // stride, output_size[1] // stride]
anchors_per_location = num_scales * len(aspect_ratios)
self.assertEqual(
list(np_labels['cls_targets'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
anchors_per_location])
self.assertEqual(
list(np_labels['box_targets'][level].shape),
[batch_size, output_size_l[0], output_size_l[1],
4 * anchors_per_location])
# Checks shape of groundtruths for eval.
if not is_training:
self.assertEqual(np_labels['groundtruths']['source_id'].shape,
(batch_size,))
self.assertEqual(np_labels['groundtruths']['classes'].shape,
(batch_size, max_num_instances))
self.assertEqual(np_labels['groundtruths']['boxes'].shape,
(batch_size, max_num_instances, 4))
self.assertEqual(np_labels['groundtruths']['areas'].shape,
(batch_size, max_num_instances))
self.assertEqual(np_labels['groundtruths']['is_crowds'].shape,
(batch_size, max_num_instances))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import csv
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import decoder
def _generate_source_id(image_bytes):
return tf.strings.as_string(
tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 63 - 1))
class TfExampleDecoder(decoder.Decoder):
"""Tensorflow Example proto decoder."""
def __init__(self,
include_mask=False,
regenerate_source_id=False):
self._include_mask = include_mask
self._regenerate_source_id = regenerate_source_id
self._keys_to_features = {
'image/encoded': tf.io.FixedLenFeature((), tf.string),
'image/source_id': tf.io.FixedLenFeature((), tf.string),
'image/height': tf.io.FixedLenFeature((), tf.int64),
'image/width': tf.io.FixedLenFeature((), tf.int64),
'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
'image/object/class/label': tf.io.VarLenFeature(tf.int64),
'image/object/area': tf.io.VarLenFeature(tf.float32),
'image/object/is_crowd': tf.io.VarLenFeature(tf.int64),
}
if include_mask:
self._keys_to_features.update({
'image/object/mask': tf.io.VarLenFeature(tf.string),
})
def _decode_image(self, parsed_tensors):
"""Decodes the image and set its static shape."""
image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3)
image.set_shape([None, None, 3])
return image
def _decode_boxes(self, parsed_tensors):
"""Concat box coordinates in the format of [ymin, xmin, ymax, xmax]."""
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
return tf.stack([ymin, xmin, ymax, xmax], axis=-1)
def _decode_classes(self, parsed_tensors):
return parsed_tensors['image/object/class/label']
def _decode_areas(self, parsed_tensors):
xmin = parsed_tensors['image/object/bbox/xmin']
xmax = parsed_tensors['image/object/bbox/xmax']
ymin = parsed_tensors['image/object/bbox/ymin']
ymax = parsed_tensors['image/object/bbox/ymax']
height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32)
width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32)
return tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0),
lambda: parsed_tensors['image/object/area'],
lambda: (xmax - xmin) * (ymax - ymin) * height * width)
def _decode_masks(self, parsed_tensors):
"""Decode a set of PNG masks to the tf.float32 tensors."""
def _decode_png_mask(png_bytes):
mask = tf.squeeze(
tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1)
mask = tf.cast(mask, dtype=tf.float32)
mask.set_shape([None, None])
return mask
height = parsed_tensors['image/height']
width = parsed_tensors['image/width']
masks = parsed_tensors['image/object/mask']
return tf.cond(
pred=tf.greater(tf.size(input=masks), 0),
true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32),
false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32))
def decode(self, serialized_example):
"""Decode the serialized example.
Args:
serialized_example: a single serialized tf.Example string.
Returns:
decoded_tensors: a dictionary of tensors with the following fields:
- source_id: a string scalar tensor.
- image: a uint8 tensor of shape [None, None, 3].
- height: an integer scalar tensor.
- width: an integer scalar tensor.
- groundtruth_classes: an int64 tensor of shape [None].
- groundtruth_is_crowd: a bool tensor of shape [None].
- groundtruth_area: a float32 tensor of shape [None].
- groundtruth_boxes: a float32 tensor of shape [None, 4].
- groundtruth_instance_masks: a float32 tensor of shape
[None, None, None].
- groundtruth_instance_masks_png: a string tensor of shape [None].
"""
parsed_tensors = tf.io.parse_single_example(
serialized=serialized_example, features=self._keys_to_features)
for k in parsed_tensors:
if isinstance(parsed_tensors[k], tf.SparseTensor):
if parsed_tensors[k].dtype == tf.string:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value='')
else:
parsed_tensors[k] = tf.sparse.to_dense(
parsed_tensors[k], default_value=0)
if self._regenerate_source_id:
source_id = _generate_source_id(parsed_tensors['image/encoded'])
else:
source_id = tf.cond(
tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0),
lambda: parsed_tensors['image/source_id'],
lambda: _generate_source_id(parsed_tensors['image/encoded']))
image = self._decode_image(parsed_tensors)
boxes = self._decode_boxes(parsed_tensors)
classes = self._decode_classes(parsed_tensors)
areas = self._decode_areas(parsed_tensors)
is_crowds = tf.cond(
tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0),
lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool),
lambda: tf.zeros_like(classes, dtype=tf.bool))
if self._include_mask:
masks = self._decode_masks(parsed_tensors)
decoded_tensors = {
'source_id': source_id,
'image': image,
'height': parsed_tensors['image/height'],
'width': parsed_tensors['image/width'],
'groundtruth_classes': classes,
'groundtruth_is_crowd': is_crowds,
'groundtruth_area': areas,
'groundtruth_boxes': boxes,
}
if self._include_mask:
decoded_tensors.update({
'groundtruth_instance_masks': masks,
'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'],
})
return decoded_tensors
class TfExampleDecoderLabelMap(TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, label_map, include_mask=False, regenerate_source_id=False):
super(TfExampleDecoderLabelMap, self).__init__(
include_mask=include_mask, regenerate_source_id=regenerate_source_id)
self._keys_to_features.update({
'image/object/class/text': tf.io.VarLenFeature(tf.string),
})
name_to_id = self._process_label_map(label_map)
self._name_to_id_table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(
keys=tf.constant(list(name_to_id.keys()), dtype=tf.string),
values=tf.constant(list(name_to_id.values()), dtype=tf.int64)),
default_value=-1)
def _process_label_map(self, label_map):
if label_map.endswith('.csv'):
name_to_id = self._process_csv(label_map)
else:
      raise ValueError('The label map file must be a .csv file.')
return name_to_id
def _process_csv(self, label_map):
name_to_id = {}
with tf.io.gfile.GFile(label_map, 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if len(row) != 2:
raise ValueError('Each row of the csv label map file must be in '
'`id,name` format. length = {}'.format(len(row)))
id_index = int(row[0])
name = row[1]
name_to_id[name] = id_index
return name_to_id
def _decode_classes(self, parsed_tensors):
return self._name_to_id_table.lookup(
parsed_tensors['image/object/class/text'])
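# A minimal round-trip sketch: it builds one synthetic tf.Example in memory,
# so the only assumption is the feature schema declared in `TfExampleDecoder`.
def _decode_sketch():
  """Decodes a tiny synthetic tf.Example with no box annotations."""
  image_bytes = tf.io.encode_jpeg(tf.zeros([4, 4, 3], tf.uint8)).numpy()
  example = tf.train.Example(features=tf.train.Features(feature={
      'image/encoded': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[image_bytes])),
      'image/source_id': tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[b'42'])),
      'image/height': tf.train.Feature(
          int64_list=tf.train.Int64List(value=[4])),
      'image/width': tf.train.Feature(
          int64_list=tf.train.Int64List(value=[4])),
  }))
  decoded = TfExampleDecoder().decode(tf.constant(example.SerializeToString()))
  # Box/class features were omitted, so the groundtruth tensors come out empty.
  return decoded['image'], decoded['groundtruth_boxes']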
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_example_decoder.py."""
import io
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
DUMP_SOURCE_ID = b'123'
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
(100, 100, 0, True),
(100, 100, 1, True),
(100, 100, 2, True),
(100, 100, 0, False),
(100, 100, 1, False),
(100, 100, 2, False),
)
def test_result_shape(self,
image_height,
image_width,
num_instances,
regenerate_source_id):
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=True, regenerate_source_id=regenerate_source_id)
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt='JPEG')
if num_instances == 0:
xmins = []
xmaxs = []
ymins = []
ymaxs = []
labels = []
areas = []
is_crowds = []
masks = []
else:
xmins = list(np.random.rand(num_instances))
xmaxs = list(np.random.rand(num_instances))
ymins = list(np.random.rand(num_instances))
ymaxs = list(np.random.rand(num_instances))
labels = list(np.random.randint(100, size=num_instances))
areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
is_crowds = [0] * num_instances
masks = []
for _ in range(num_instances):
mask = _encode_image(
np.uint8(np.random.rand(image_height, image_width) * 255),
fmt='PNG')
masks.append(mask)
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
if not regenerate_source_id:
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
def test_handling_missing_fields(self):
decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [3, 1]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/label': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=labels))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[3, 1], results['groundtruth_classes'])
self.assertAllEqual(
[False, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import csv
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
class TfExampleDecoderLabelMap(tf_example_decoder.TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, label_map, include_mask=False, regenerate_source_id=False):
super(TfExampleDecoderLabelMap, self).__init__(
include_mask=include_mask, regenerate_source_id=regenerate_source_id)
self._keys_to_features.update({
'image/object/class/text': tf.io.VarLenFeature(tf.string),
})
name_to_id = self._process_label_map(label_map)
self._name_to_id_table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(
keys=tf.constant(list(name_to_id.keys()), dtype=tf.string),
values=tf.constant(list(name_to_id.values()), dtype=tf.int64)),
default_value=-1)
def _process_label_map(self, label_map):
if label_map.endswith('.csv'):
name_to_id = self._process_csv(label_map)
else:
      raise ValueError('The label map file must be a .csv file.')
return name_to_id
def _process_csv(self, label_map):
name_to_id = {}
with tf.io.gfile.GFile(label_map, 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if len(row) != 2:
raise ValueError('Each row of the csv label map file must be in '
'`id,name` format. length = {}'.format(len(row)))
id_index = int(row[0])
name = row[1]
name_to_id[name] = id_index
return name_to_id
def _decode_classes(self, parsed_tensors):
return self._name_to_id_table.lookup(
parsed_tensors['image/object/class/text'])
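# A minimal usage sketch; the CSV content and the temp path are illustrative.
# Each row must be `id,name`, as enforced by `_process_csv` above.
def _label_map_decoder_sketch(tmp_dir):
  """Writes a tiny `id,name` CSV and builds a label-map decoder from it."""
  import os
  label_map_path = os.path.join(tmp_dir, 'label_map.csv')
  with tf.io.gfile.GFile(label_map_path, 'w') as f:
    f.write('0,cat\n1,dog')
  # Class text is looked up through the hash table; unknown names map to -1.
  return TfExampleDecoderLabelMap(label_map_path)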
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tf_example_label_map_decoder.py."""
import io
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_label_map_decoder
DUMP_SOURCE_ID = b'123'
LABEL_MAP_CSV_CONTENT = '0,class_0\n1,class_1\n2,class_2'
def _encode_image(image_array, fmt):
image = Image.fromarray(image_array)
with io.BytesIO() as output:
image.save(output, format=fmt)
return output.getvalue()
class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
  @parameterized.parameters(
      (100, 100, 0),
      (100, 100, 1),
      (100, 100, 2),
  )
def test_result_shape(self, image_height, image_width, num_instances):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
image = _encode_image(
np.uint8(np.random.rand(image_height, image_width, 3) * 255),
fmt='JPEG')
if num_instances == 0:
xmins = []
xmaxs = []
ymins = []
ymaxs = []
labels = []
areas = []
is_crowds = []
masks = []
else:
xmins = list(np.random.rand(num_instances))
xmaxs = list(np.random.rand(num_instances))
ymins = list(np.random.rand(num_instances))
ymaxs = list(np.random.rand(num_instances))
areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
is_crowds = [0] * num_instances
masks = []
labels = [b'class_1'] * num_instances
for _ in range(num_instances):
mask = _encode_image(
np.uint8(np.random.rand(image_height, image_width) * 255),
fmt='PNG')
masks.append(mask)
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/text': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
def test_result_content(self):
label_map_dir = self.get_temp_dir()
label_map_name = 'label_map.csv'
label_map_path = os.path.join(label_map_dir, label_map_name)
with open(label_map_path, 'w') as f:
f.write(LABEL_MAP_CSV_CONTENT)
decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
label_map_path, include_mask=True)
image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
image = _encode_image(np.uint8(image_content), fmt='PNG')
image_height = 4
image_width = 4
num_instances = 2
xmins = [0, 0.25]
xmaxs = [0.5, 1.0]
ymins = [0, 0]
ymaxs = [0.5, 1.0]
labels = [b'class_2', b'class_0']
areas = [
0.25 * image_height * image_width, 0.75 * image_height * image_width
]
is_crowds = [1, 0]
mask_content = [[[255, 255, 0, 0],
[255, 255, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]],
[[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255],
[0, 255, 255, 255]]]
masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
serialized_example = tf.train.Example(
features=tf.train.Features(
feature={
'image/encoded': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[image]))),
'image/source_id': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
'image/height': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_height]))),
'image/width': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=[image_width]))),
'image/object/bbox/xmin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmins))),
'image/object/bbox/xmax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=xmaxs))),
'image/object/bbox/ymin': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymins))),
'image/object/bbox/ymax': (
tf.train.Feature(
float_list=tf.train.FloatList(value=ymaxs))),
'image/object/class/text': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=labels))),
'image/object/is_crowd': (
tf.train.Feature(
int64_list=tf.train.Int64List(value=is_crowds))),
'image/object/area': (
tf.train.Feature(
float_list=tf.train.FloatList(value=areas))),
'image/object/mask': (
tf.train.Feature(
bytes_list=tf.train.BytesList(value=masks))),
})).SerializeToString()
decoded_tensors = decoder.decode(
tf.convert_to_tensor(value=serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertAllEqual(
(image_height, image_width, 3), results['image'].shape)
self.assertAllEqual(image_content, results['image'])
self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
self.assertEqual(image_height, results['height'])
self.assertEqual(image_width, results['width'])
self.assertAllEqual(
(num_instances,), results['groundtruth_classes'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_is_crowd'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_area'].shape)
self.assertAllEqual(
(num_instances, 4), results['groundtruth_boxes'].shape)
self.assertAllEqual(
(num_instances, image_height, image_width),
results['groundtruth_instance_masks'].shape)
self.assertAllEqual(
(num_instances,), results['groundtruth_instance_masks_png'].shape)
self.assertAllEqual(
[2, 0], results['groundtruth_classes'])
self.assertAllEqual(
[True, False], results['groundtruth_is_crowd'])
self.assertNDArrayNear(
[0.25 * image_height * image_width, 0.75 * image_height * image_width],
results['groundtruth_area'], 1e-4)
self.assertNDArrayNear(
[[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]],
results['groundtruth_boxes'], 1e-4)
self.assertNDArrayNear(
mask_content, results['groundtruth_instance_masks'], 1e-4)
self.assertAllEqual(
masks, results['groundtruth_instance_masks_png'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader utils."""
# Import libraries
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int32)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int32),
false_fn=lambda: tf.identity(source_id))
return source_id
def pad_groundtruths_to_fixed_size(groundtruths, size):
"""Pads the first dimension of groundtruths labels to the fixed size."""
groundtruths['boxes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['boxes'], size, -1)
groundtruths['is_crowds'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['is_crowds'], size, 0)
groundtruths['areas'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['areas'], size, -1)
groundtruths['classes'] = preprocess_ops.clip_or_pad_to_fixed_size(
groundtruths['classes'], size, -1)
return groundtruths
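# A quick behavior sketch: tensors are clipped or padded along the first
# dimension; -1 is the pad value except for `is_crowds`, which pads with 0.
def _padding_sketch():
  groundtruths = {
      'boxes': tf.zeros([2, 4]),
      'is_crowds': tf.zeros([2], tf.int32),
      'areas': tf.ones([2]),
      'classes': tf.ones([2], tf.int32),
  }
  padded = pad_groundtruths_to_fixed_size(groundtruths, size=5)
  return padded['boxes'].shape  # (5, 4) regardless of the input count.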
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Parser for video and label datasets."""
from typing import Dict, Optional, Tuple
from absl import logging
import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import preprocess_ops_3d
IMAGE_KEY = 'image/encoded'
LABEL_KEY = 'clip/label/index'
def _process_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
stride: int = 1,
num_test_clips: int = 1,
min_resize: int = 224,
crop_size: int = 200,
zero_centering_image: bool = False,
seed: Optional[int] = None) -> tf.Tensor:
"""Processes a serialized image tensor.
Args:
image: Input Tensor of shape [timesteps] and type tf.string of serialized
frames.
is_training: Whether or not in training mode. If True, random sampling,
  cropping and left-right flipping are used.
num_frames: Number of frames per subclip.
stride: Temporal stride to sample frames.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
are aggregated in the batch dimension.
min_resize: Frames are resized so that min(height, width) is min_resize.
crop_size: Final size of the frame after cropping the resized frames. Both
height and width are the same.
zero_centering_image: If True, frames are normalized to values in [-1, 1].
If False, values in [0, 1].
seed: A deterministic seed to use when sampling.
Returns:
Processed frames. Tensor of shape
[num_frames * num_test_clips, crop_size, crop_size, 3].
"""
# Validate parameters.
if is_training and num_test_clips != 1:
logging.warning(
'`num_test_clips` %d is ignored since `is_training` is `True`.',
num_test_clips)
# Temporal sampler.
if is_training:
# Sample random clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride,
seed)
elif num_test_clips > 1:
# Sample linspace clips.
image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips,
num_frames, stride)
else:
# Sample middle clip.
image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride)
# Decode JPEG string to tf.uint8.
image = preprocess_ops_3d.decode_jpeg(image, 3)
# Resize images (resize happens only if necessary to save compute).
image = preprocess_ops_3d.resize_smallest(image, min_resize)
if is_training:
# Standard image data augmentation: random crop and random flip.
image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, True,
seed)
image = preprocess_ops_3d.random_flip_left_right(image, seed)
else:
# Central crop of the frames.
image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False)
# Cast the frames in float32, normalizing according to zero_centering_image.
return preprocess_ops_3d.normalize_image(image, zero_centering_image)
def _postprocess_image(image: tf.Tensor,
is_training: bool = True,
num_frames: int = 32,
num_test_clips: int = 1) -> tf.Tensor:
"""Processes a batched Tensor of frames.
The same parameters used in process should be used here.
Args:
image: Input Tensor of shape [batch, timesteps, height, width, 3].
is_training: Whether or not in training mode. If True, random sampling,
  cropping and left-right flipping are used.
num_frames: Number of frames per subclip.
num_test_clips: Number of test clips (1 by default). If more than 1, this
will sample multiple linearly spaced clips within each video at test time.
If 1, then a single clip in the middle of the video is sampled. The clips
are aggregated in the batch dimension.
Returns:
Processed frames. Tensor of shape
[batch * num_test_clips, num_frames, height, width, 3].
"""
if num_test_clips > 1 and not is_training:
    # In this case, multiple clips are merged together in the batch dimension,
    # which will be batch * num_test_clips.
image = tf.reshape(
image, (-1, num_frames, image.shape[2], image.shape[3], image.shape[4]))
return image
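# A shape sketch for the reshape above: with num_test_clips=2 and
# num_frames=8, an eval batch of [2, 16, 8, 8, 3] frames becomes
# [4, 8, 8, 8, 3], i.e. the clips move into the batch dimension.
def _postprocess_image_sketch():
  frames = tf.zeros([2, 16, 8, 8, 3])
  return _postprocess_image(frames, is_training=False, num_frames=8,
                            num_test_clips=2).shape  # (4, 8, 8, 8, 3)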
def _process_label(label: tf.Tensor,
one_hot_label: bool = True,
num_classes: Optional[int] = None) -> tf.Tensor:
"""Processes label Tensor."""
# Validate parameters.
if one_hot_label and not num_classes:
raise ValueError(
'`num_classes` should be given when requesting one hot label.')
# Cast to tf.int32.
label = tf.cast(label, dtype=tf.int32)
if one_hot_label:
# Replace label index by one hot representation.
label = tf.one_hot(label, num_classes)
return label
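# A one-liner sketch of the label transform above.
def _process_label_sketch():
  # tf.one_hot maps index 2 with num_classes=4 to [0., 0., 1., 0.].
  return _process_label(tf.constant(2), one_hot_label=True, num_classes=4)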
class Decoder(decoder.Decoder):
"""A tf.Example decoder for classification task."""
  def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY):
    self._image_key = image_key
    self._label_key = label_key
self._context_description = {
# One integer stored in context.
self._label_key: tf.io.FixedLenFeature((), tf.int64),
}
self._sequence_description = {
# Each image is a string encoding JPEG.
self._image_key: tf.io.FixedLenSequenceFeature((), tf.string),
}
def decode(self, serialized_example):
"""Parses a single tf.Example into image and label tensors."""
context, sequences = tf.io.parse_single_sequence_example(
serialized_example, self._context_description,
self._sequence_description)
return {
self._image_key: sequences[self._image_key],
self._label_key: context[self._label_key]
}
class Parser(parser.Parser):
"""Parses a video and label dataset."""
def __init__(self,
input_params: exp_cfg.DataConfig,
image_key: str = IMAGE_KEY,
label_key: str = LABEL_KEY):
self._num_frames = input_params.feature_shape[0]
self._stride = input_params.temporal_stride
self._num_test_clips = input_params.num_test_clips
self._min_resize = input_params.min_image_size
self._crop_size = input_params.feature_shape[1]
self._one_hot_label = input_params.one_hot
self._num_classes = input_params.num_classes
self._image_key = image_key
self._label_key = label_key
def _parse_train_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for training."""
# Process image and label.
image = decoded_tensors[self._image_key]
label = decoded_tensors[self._label_key]
image = _process_image(
image=image,
is_training=True,
num_frames=self._num_frames,
stride=self._stride,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size)
label = _process_label(label, self._one_hot_label, self._num_classes)
return {'image': image}, label
def _parse_eval_data(
self, decoded_tensors: Dict[str, tf.Tensor]
) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses data for evaluation."""
image = decoded_tensors[self._image_key]
label = decoded_tensors[self._label_key]
image = _process_image(
image=image,
is_training=False,
num_frames=self._num_frames,
stride=self._stride,
num_test_clips=self._num_test_clips,
min_resize=self._min_resize,
crop_size=self._crop_size)
label = _process_label(label, self._one_hot_label, self._num_classes)
return {'image': image}, label
class PostBatchProcessor(object):
"""Processes a video and label dataset which is batched."""
def __init__(self, input_params: exp_cfg.DataConfig):
self._is_training = input_params.is_training
self._num_frames = input_params.feature_shape[0]
self._num_test_clips = input_params.num_test_clips
def __call__(
self,
image: Dict[str, tf.Tensor],
label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
"""Parses a single tf.Example into image and label tensors."""
image = image['image']
image = _postprocess_image(
image=image,
is_training=self._is_training,
num_frames=self._num_frames,
num_test_clips=self._num_test_clips)
return {'image': image}, label
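# An end-to-end wiring sketch, assuming a `DataConfig` such as
# video_classification.kinetics600 and a dataset of serialized
# tf.SequenceExample strings; both arguments are placeholders.
def _video_pipeline_sketch(params: exp_cfg.DataConfig,
                           dataset: tf.data.Dataset) -> tf.data.Dataset:
  """Chains Decoder -> Parser -> batch -> PostBatchProcessor."""
  video_decoder = Decoder()
  video_parser = Parser(params)
  postprocess = PostBatchProcessor(params)
  dataset = dataset.map(video_decoder.decode)
  dataset = dataset.map(video_parser.parse_fn(params.is_training))
  dataset = dataset.batch(params.global_batch_size)
  # The post-batch step folds test clips into the batch dimension.
  return dataset.map(postprocess)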
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_input.py."""
import io
# Import libraries
import numpy as np
from PIL import Image
import tensorflow as tf
from official.vision.beta.configs import video_classification as exp_cfg
from official.vision.beta.dataloaders import video_input
class DecoderTest(tf.test.TestCase):
"""A tf.SequenceExample decoder for the video classification task."""
def test_decoder(self):
decoder = video_input.Decoder()
# Create fake data.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
label = 42
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
seq_example = tf.train.SequenceExample()
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [
label
]
serialized_example = seq_example.SerializeToString()
decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example))
results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors)
self.assertCountEqual([video_input.IMAGE_KEY, video_input.LABEL_KEY],
results.keys())
self.assertEqual(label, results[video_input.LABEL_KEY])
class VideoAndLabelParserTest(tf.test.TestCase):
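  """Tests for the video classification Parser."""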
def test_video_input(self):
params = exp_cfg.kinetics600(is_training=True)
params.feature_shape = (2, 224, 224, 3)
params.min_image_size = 224
decoder = video_input.Decoder()
parser = video_input.Parser(params).parse_fn(params.is_training)
# Create fake data.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
random_image = Image.fromarray(random_image)
with io.BytesIO() as buffer:
random_image.save(buffer, format='JPEG')
raw_image_bytes = buffer.getvalue()
seq_example = tf.train.SequenceExample()
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.feature_lists.feature_list.get_or_create(
video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [
raw_image_bytes
]
seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [
42
]
input_tensor = tf.constant(seq_example.SerializeToString())
decoded_tensors = decoder.decode(input_tensor)
output_tensor = parser(decoded_tensors)
image_features, label = output_tensor
image = image_features['image']
self.assertAllEqual(image.shape, (2, 224, 224, 3))
self.assertAllEqual(label.shape, (600,))
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The COCO-style evaluator.
The following snippet demonstrates the use of interfaces:
evaluator = COCOEvaluator(...)
for _ in range(num_evals):
for _ in range(num_batches_per_eval):
predictions, groundtruths = predictor.predict(...)  # pop a batch.
evaluator.update_state(groundtruths, predictions)
evaluator.result() # finish one full eval and reset states.
See also: https://github.com/cocodataset/cocoapi/
"""
import atexit
import tempfile
# Import libraries
from absl import logging
import numpy as np
from pycocotools import cocoeval
import six
import tensorflow as tf
from official.vision.beta.evaluation import coco_utils
class COCOEvaluator(object):
"""COCO evaluation metric class."""
def __init__(self, annotation_file, include_mask, need_rescale_bboxes=True):
"""Constructs COCO evaluation class.
The class provides the interface to the COCO metrics. `update_state()`
takes detections and groundtruths from each batch and aggregates them
internally. `evaluate()` loads a JSON file in COCO annotation format as the
groundtruths (or builds one from the aggregated groundtruths) and runs COCO
evaluation.
Args:
annotation_file: a JSON file that stores annotations of the eval dataset.
If `annotation_file` is None, groundtruth annotations will be loaded
from the dataloader.
include_mask: a boolean to indicate whether or not to include the mask
eval.
need_rescale_bboxes: If True, bboxes in `predictions` will be rescaled back
to absolute values (`image_info` is needed in this case).
"""
if annotation_file:
if annotation_file.startswith('gs://'):
_, local_val_json = tempfile.mkstemp(suffix='.json')
tf.io.gfile.remove(local_val_json)
tf.io.gfile.copy(annotation_file, local_val_json)
atexit.register(tf.io.gfile.remove, local_val_json)
else:
local_val_json = annotation_file
self._coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if include_mask else 'box'),
annotation_file=local_val_json)
self._annotation_file = annotation_file
self._include_mask = include_mask
self._metric_names = [
'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10',
'ARmax100', 'ARs', 'ARm', 'ARl'
]
self._required_prediction_fields = [
'source_id', 'num_detections', 'detection_classes', 'detection_scores',
'detection_boxes'
]
self._need_rescale_bboxes = need_rescale_bboxes
if self._need_rescale_bboxes:
self._required_prediction_fields.append('image_info')
self._required_groundtruth_fields = [
'source_id', 'height', 'width', 'classes', 'boxes'
]
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self._metric_names]
self._metric_names.extend(mask_metric_names)
self._required_prediction_fields.extend(['detection_masks'])
self._required_groundtruth_fields.extend(['masks'])
self.reset_states()
@property
def name(self):
return 'coco_metric'
def reset_states(self):
"""Resets internal states for a fresh run."""
self._predictions = {}
if not self._annotation_file:
self._groundtruths = {}
def result(self):
"""Evaluates detection results, and reset_states."""
metric_dict = self.evaluate()
# Cleans up the internal variables in order for a fresh eval next time.
self.reset_states()
return metric_dict
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
metrics_dict: a dictionary mapping each COCO-style metric name (box and,
  if enabled, mask metrics) to a float numpy scalar.
"""
if not self._annotation_file:
logging.info('No annotation file provided; building COCO groundtruths '
             'on the fly.')
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
logging.info('Using annotation file: %s', self._annotation_file)
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
return metrics_dict
def _process_predictions(self, predictions):
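    """Rescales detection boxes back to the original image space."""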
image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
predictions['detection_boxes'] = (
predictions['detection_boxes'].astype(np.float32))
predictions['detection_boxes'] /= image_scale
if 'detection_outer_boxes' in predictions:
predictions['detection_outer_boxes'] = (
predictions['detection_outer_boxes'].astype(np.float32))
predictions['detection_outer_boxes'] /= image_scale
def _convert_to_numpy(self, groundtruths, predictions):
"""Converts tesnors to numpy arrays."""
if groundtruths:
labels = tf.nest.map_structure(lambda x: x.numpy(), groundtruths)
numpy_groundtruths = {}
for key, val in labels.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_groundtruths[key] = val
else:
numpy_groundtruths = groundtruths
if predictions:
outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions)
numpy_predictions = {}
for key, val in outputs.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_predictions[key] = val
else:
numpy_predictions = predictions
return numpy_groundtruths, numpy_predictions
def update_state(self, groundtruths, predictions):
"""Update and aggregate detection results and groundtruth data.
Args:
groundtruths: a dictionary of Tensors including the fields below.
See also different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- height: a numpy array of int of shape [batch_size].
- width: a numpy array of int of shape [batch_size].
- num_detections: a numpy array of int of shape [batch_size].
- boxes: a numpy array of float of shape [batch_size, K, 4].
- classes: a numpy array of int of shape [batch_size, K].
Optional fields:
- is_crowds: a numpy array of int of shape [batch_size, K]. If the
field is absent, it is assumed that this instance is not crowd.
- areas: a numpy array of float of shape [batch_size, K]. If the
  field is absent, the area is calculated using either boxes or
  masks depending on which one is available.
- masks: a numpy array of float of shape
  [batch_size, K, mask_height, mask_width].
predictions: a dictionary of tensors including the fields below.
See different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- image_info [if `need_rescale_bboxes` is True]: a numpy array of
float of shape [batch_size, 4, 2].
- num_detections: a numpy array of
int of shape [batch_size].
- detection_boxes: a numpy array of float of shape [batch_size, K, 4].
- detection_classes: a numpy array of int of shape [batch_size, K].
- detection_scores: a numpy array of float of shape [batch_size, K].
Optional fields:
- detection_masks: a numpy array of float of shape
[batch_size, K, mask_height, mask_width].
Raises:
ValueError: if the required prediction or groundtruth fields are not
present in the incoming `predictions` or `groundtruths`.
"""
groundtruths, predictions = self._convert_to_numpy(groundtruths,
predictions)
for k in self._required_prediction_fields:
if k not in predictions:
raise ValueError(
'Missing the required key `{}` in predictions!'.format(k))
if self._need_rescale_bboxes:
self._process_predictions(predictions)
for k, v in six.iteritems(predictions):
if k not in self._predictions:
self._predictions[k] = [v]
else:
self._predictions[k].append(v)
if not self._annotation_file:
assert groundtruths
for k in self._required_groundtruth_fields:
if k not in groundtruths:
raise ValueError(
'Missing the required key `{}` in groundtruths!'.format(k))
for k, v in six.iteritems(groundtruths):
if k not in self._groundtruths:
self._groundtruths[k] = [v]
else:
self._groundtruths[k].append(v)
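# A minimal usage sketch (an assumption, not part of this change): one full
# evaluation round with on-the-fly groundtruths. `eval_batches` is a
# hypothetical iterable yielding (groundtruths, predictions) dicts carrying
# the fields documented in `update_state` above.
def run_coco_eval_sketch(eval_batches, include_mask=False):
  evaluator = COCOEvaluator(annotation_file=None, include_mask=include_mask)
  for groundtruths, predictions in eval_batches:
    evaluator.update_state(groundtruths, predictions)
  # result() computes the metrics and resets state for the next eval round.
  return evaluator.result()  # e.g. metrics['AP'], metrics['AP50'].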
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for coco_evaluator."""
import io
import os
# Import libraries
from absl import logging
from absl.testing import absltest
from absl.testing import parameterized
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
import six
import tensorflow as tf
from official.vision.beta.evaluation import coco_evaluator
from official.vision.beta.evaluation import coco_utils
_COCO_JSON_FILE = '/placer/prod/home/snaggletooth/test/data/coco/instances_val2017.json'
_SAVED_COCO_JSON_FILE = 'tmp.json'
def get_groundtruth_annotations(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
image = coco.loadImgs([image_id])[0]
groundtruths = {
'boxes': [],
'classes': [],
'is_crowds': [],
'areas': [],
}
if include_mask:
groundtruths['masks'] = []
for ann in anns:
# Converts COCO [x, y, w, h] boxes to [y1, x1, y2, x2] format.
box = [ann['bbox'][1],
ann['bbox'][0],
(ann['bbox'][1] + ann['bbox'][3]),
(ann['bbox'][0] + ann['bbox'][2])]
# Creates groundtruths.
groundtruths['boxes'].append(box)
groundtruths['classes'].append(ann['category_id'])
groundtruths['is_crowds'].append(ann['iscrowd'])
groundtruths['areas'].append(ann['area'])
if include_mask:
mask_img = Image.fromarray(coco.annToMask(ann).astype(np.uint8))
with io.BytesIO() as stream:
mask_img.save(stream, format='PNG')
mask_bytes = stream.getvalue()
groundtruths['masks'].append(mask_bytes)
for key, val in groundtruths.items():
groundtruths[key] = np.stack(val, axis=0)
groundtruths['source_id'] = image['id']
groundtruths['height'] = image['height']
groundtruths['width'] = image['width']
groundtruths['num_detections'] = len(anns)
for k, v in six.iteritems(groundtruths):
groundtruths[k] = np.expand_dims(v, axis=0)
return groundtruths
def get_predictions(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
image = coco.loadImgs([image_id])[0]
predictions = {
'detection_boxes': [],
'detection_classes': [],
'detection_scores': [],
}
if include_mask:
predictions['detection_masks'] = []
for ann in anns:
# Creates detections from groundtruths.
# Converts [x, y, w, h] to [y1, x1, y2, x2] box format and
# does the denormalization.
box = [ann['bbox'][1],
ann['bbox'][0],
(ann['bbox'][1] + ann['bbox'][3]),
(ann['bbox'][0] + ann['bbox'][2])]
predictions['detection_boxes'].append(box)
predictions['detection_classes'].append(ann['category_id'])
predictions['detection_scores'].append(1)
if include_mask:
mask = coco.annToMask(ann)
predictions['detection_masks'].append(mask)
for key, val in predictions.items():
predictions[key] = np.expand_dims(np.stack(val, axis=0), axis=0)
predictions['source_id'] = np.array([image['id']])
predictions['num_detections'] = np.array([len(anns)])
predictions['image_info'] = np.array(
[[[image['height'], image['width']],
[image['height'], image['width']],
[1, 1],
[0, 0]]], dtype=np.float32)
return predictions
def get_fake_predictions(image_id, coco, include_mask=False):
anns = coco.loadAnns(coco.getAnnIds([image_id]))
if not anns:
return None
label_id_max = max([ann['category_id'] for ann in anns])
image = coco.loadImgs([image_id])[0]
num_detections = 100
xmin = np.random.randint(
low=0, high=int(image['width'] / 2), size=(1, num_detections))
xmax = np.random.randint(
low=int(image['width'] / 2), high=image['width'],
size=(1, num_detections))
ymin = np.random.randint(
low=0, high=int(image['height'] / 2), size=(1, num_detections))
ymax = np.random.randint(
low=int(image['height'] / 2), high=image['height'],
size=(1, num_detections))
predictions = {
'detection_boxes': np.stack([ymin, xmin, ymax, xmax], axis=-1),
'detection_classes': np.random.randint(
low=0, high=(label_id_max + 1), size=(1, num_detections)),
'detection_scores': np.random.random(size=(1, num_detections)),
}
if include_mask:
predictions['detection_masks'] = np.random.randint(
1, size=(1, num_detections, image['height'], image['width']))
predictions['source_id'] = np.array([image['id']])
predictions['num_detections'] = np.array([num_detections])
predictions['image_info'] = np.array(
[[[image['height'], image['width']],
[image['height'], image['width']],
[1, 1],
[0, 0]]], dtype=np.float32)
return predictions
class DummyGroundtruthGenerator(object):
def __init__(self, include_mask, image_id, coco):
self._include_mask = include_mask
self._image_id = image_id
self._coco = coco
def __call__(self):
yield get_groundtruth_annotations(
self._image_id, self._coco, self._include_mask)
class COCOEvaluatorTest(parameterized.TestCase, absltest.TestCase):
def setUp(self):
super(COCOEvaluatorTest, self).setUp()
temp = self.create_tempdir()
self._saved_coco_json_file = os.path.join(temp.full_path,
_SAVED_COCO_JSON_FILE)
def tearDown(self):
super(COCOEvaluatorTest, self).tearDown()
@parameterized.parameters(
(False, False), (False, True), (True, False), (True, True))
def testEval(self, include_mask, use_fake_predictions):
coco = COCO(annotation_file=_COCO_JSON_FILE)
index = np.random.randint(len(coco.dataset['images']))
image_id = coco.dataset['images'][index]['id']
if use_fake_predictions:
predictions = get_fake_predictions(
image_id, coco, include_mask=include_mask)
else:
predictions = get_predictions(image_id, coco, include_mask=include_mask)
if not predictions:
logging.info('Empty predictions for index=%d', index)
return
predictions = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
predictions)
evaluator_w_json = coco_evaluator.COCOEvaluator(
annotation_file=_COCO_JSON_FILE, include_mask=include_mask)
evaluator_w_json.update_state(groundtruths=None, predictions=predictions)
results_w_json = evaluator_w_json.result()
dummy_generator = DummyGroundtruthGenerator(
include_mask=include_mask, image_id=image_id, coco=coco)
coco_utils.generate_annotation_file(dummy_generator,
self._saved_coco_json_file)
evaluator_no_json = coco_evaluator.COCOEvaluator(
annotation_file=self._saved_coco_json_file, include_mask=include_mask)
evaluator_no_json.update_state(groundtruths=None, predictions=predictions)
results_no_json = evaluator_no_json.result()
for k, v in results_w_json.items():
self.assertEqual(v, results_no_json[k])
@parameterized.parameters(
(False, False), (False, True), (True, False), (True, True))
def testEvalOnTheFly(self, include_mask, use_fake_predictions):
coco = COCO(annotation_file=_COCO_JSON_FILE)
index = np.random.randint(len(coco.dataset['images']))
image_id = coco.dataset['images'][index]['id']
if use_fake_predictions:
predictions = get_fake_predictions(
image_id, coco, include_mask=include_mask)
else:
predictions = get_predictions(image_id, coco, include_mask=include_mask)
if not predictions:
logging.info('Empty predictions for index=%d', index)
return
predictions = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
predictions)
evaluator_w_json = coco_evaluator.COCOEvaluator(
annotation_file=_COCO_JSON_FILE, include_mask=include_mask)
evaluator_w_json.update_state(groundtruths=None, predictions=predictions)
results_w_json = evaluator_w_json.result()
groundtruths = get_groundtruth_annotations(image_id, coco, include_mask)
groundtruths = tf.nest.map_structure(
lambda x: tf.convert_to_tensor(x) if x is not None else None,
groundtruths)
evaluator_no_json = coco_evaluator.COCOEvaluator(
annotation_file=None, include_mask=include_mask)
evaluator_no_json.update_state(groundtruths, predictions)
results_no_json = evaluator_no_json.result()
for k, v in results_w_json.items():
self.assertEqual(v, results_no_json[k])
if __name__ == '__main__':
absltest.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Util functions related to pycocotools and COCO eval."""
import copy
import json
# Import libraries
from absl import logging
import numpy as np
from PIL import Image
from pycocotools import coco
from pycocotools import mask as mask_api
import six
import tensorflow as tf
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import mask_ops
class COCOWrapper(coco.COCO):
"""COCO wrapper class.
This class wraps COCO API object, which provides the following additional
functionalities:
1. Support string type image id.
2. Support loading the groundtruth dataset using the external annotation
dictionary.
3. Support loading the prediction results using the external annotation
dictionary.
"""
def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None):
"""Instantiates a COCO-style API object.
Args:
eval_type: either 'box' or 'mask'.
annotation_file: a JSON file that stores annotations of the eval dataset.
This is required if `gt_dataset` is not provided.
gt_dataset: the groundtruth eval dataset in COCO API format.
"""
if ((annotation_file and gt_dataset) or
((not annotation_file) and (not gt_dataset))):
raise ValueError('One and only one of `annotation_file` and `gt_dataset` '
'needs to be specified.')
if eval_type not in ['box', 'mask']:
raise ValueError('The `eval_type` can only be either `box` or `mask`.')
coco.COCO.__init__(self, annotation_file=annotation_file)
self._eval_type = eval_type
if gt_dataset:
self.dataset = gt_dataset
self.createIndex()
def loadRes(self, predictions):
"""Loads result file and return a result api object.
Args:
predictions: a list of dictionary each representing an annotation in COCO
format. The required fields are `image_id`, `category_id`, `score`,
`bbox`, `segmentation`.
Returns:
res: result COCO api object.
Raises:
ValueError: if the set of image ids from predictions is not a subset of
  the image ids of the groundtruth dataset.
"""
res = coco.COCO()
res.dataset['images'] = copy.deepcopy(self.dataset['images'])
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
image_ids = [ann['image_id'] for ann in predictions]
if set(image_ids) != (set(image_ids) & set(self.getImgIds())):
raise ValueError('Results do not correspond to the current dataset!')
for ann in predictions:
x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2],
ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]]
if self._eval_type == 'box':
ann['area'] = ann['bbox'][2] * ann['bbox'][3]
ann['segmentation'] = [
[x1, y1, x1, y2, x2, y2, x2, y1]]
elif self._eval_type == 'mask':
ann['area'] = mask_api.area(ann['segmentation'])
res.dataset['annotations'] = copy.deepcopy(predictions)
res.createIndex()
return res
def convert_predictions_to_coco_annotations(predictions):
"""Converts a batch of predictions to annotations in COCO format.
Args:
predictions: a dictionary of lists of numpy arrays including the following
fields. K below denotes the maximum number of instances per image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- detection_boxes: a list of numpy arrays of float of shape
[batch_size, K, 4], where coordinates are in the original image
space (not the scaled image space).
- detection_classes: a list of numpy arrays of int of shape
[batch_size, K].
- detection_scores: a list of numpy arrays of float of shape
[batch_size, K].
Optional fields:
- detection_masks: a list of numpy arrays of float of shape
[batch_size, K, mask_height, mask_width].
Returns:
coco_predictions: prediction in COCO annotation format.
"""
coco_predictions = []
num_batches = len(predictions['source_id'])
batch_size = predictions['source_id'][0].shape[0]
max_num_detections = predictions['detection_classes'][0].shape[1]
use_outer_box = 'detection_outer_boxes' in predictions
for i in range(num_batches):
predictions['detection_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_boxes'][i])
if use_outer_box:
predictions['detection_outer_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_outer_boxes'][i])
mask_boxes = predictions['detection_outer_boxes']
else:
mask_boxes = predictions['detection_boxes']
for j in range(batch_size):
if 'detection_masks' in predictions:
image_masks = mask_ops.paste_instance_masks(
predictions['detection_masks'][i][j],
mask_boxes[i][j],
int(predictions['image_info'][i][j, 0, 0]),
int(predictions['image_info'][i][j, 0, 1]))
binary_masks = (image_masks > 0.0).astype(np.uint8)
encoded_masks = [
mask_api.encode(np.asfortranarray(binary_mask))
for binary_mask in list(binary_masks)]
for k in range(max_num_detections):
ann = {}
ann['image_id'] = predictions['source_id'][i][j]
ann['category_id'] = predictions['detection_classes'][i][j, k]
ann['bbox'] = predictions['detection_boxes'][i][j, k]
ann['score'] = predictions['detection_scores'][i][j, k]
if 'detection_masks' in predictions:
ann['segmentation'] = encoded_masks[k]
coco_predictions.append(ann)
for i, ann in enumerate(coco_predictions):
ann['id'] = i + 1
return coco_predictions
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
"""Converts groundtruths to the dataset in COCO format.
Args:
groundtruths: a dictionary of lists of numpy arrays including the fields
  below, where each element in a list holds the values for one batch.
  K below denotes the (padded) maximum number of instances per image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- height: a list of numpy arrays of int of shape [batch_size].
- width: a list of numpy arrays of int of shape [batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
where coordinates are in the original image space (not the
normalized coordinates).
- classes: a list of numpy arrays of int of shape [batch_size, K].
Optional fields:
- is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If
the field is absent, it is assumed that this instance is not crowd.
- areas: a list of numpy arrays of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a list of numpy arrays of string of shape [batch_size, K].
label_map: (optional) a dictionary that maps category ids to category
  names. If `None`, the category mapping is collected from the
  `groundtruths`.
Returns:
coco_groundtruths: the groundtruth dataset in COCO format.
"""
source_ids = np.concatenate(groundtruths['source_id'], axis=0)
heights = np.concatenate(groundtruths['height'], axis=0)
widths = np.concatenate(groundtruths['width'], axis=0)
gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)} for i, h, w
in zip(source_ids, heights, widths)]
gt_annotations = []
num_batches = len(groundtruths['source_id'])
batch_size = groundtruths['source_id'][0].shape[0]
for i in range(num_batches):
for j in range(batch_size):
num_instances = groundtruths['num_detections'][i][j]
for k in range(int(num_instances)):
ann = {}
ann['image_id'] = int(groundtruths['source_id'][i][j])
if 'is_crowds' in groundtruths:
ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
else:
ann['iscrowd'] = 0
ann['category_id'] = int(groundtruths['classes'][i][j, k])
boxes = groundtruths['boxes'][i]
ann['bbox'] = [
float(boxes[j, k, 1]),
float(boxes[j, k, 0]),
float(boxes[j, k, 3] - boxes[j, k, 1]),
float(boxes[j, k, 2] - boxes[j, k, 0])]
if 'areas' in groundtruths:
ann['area'] = float(groundtruths['areas'][i][j, k])
else:
ann['area'] = float(
(boxes[j, k, 3] - boxes[j, k, 1]) *
(boxes[j, k, 2] - boxes[j, k, 0]))
if 'masks' in groundtruths:
mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height, width).astype(np.uint8))
np_mask[np_mask > 0] = 255
encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
ann['segmentation'] = encoded_mask
if 'areas' not in groundtruths:
ann['area'] = mask_api.area(encoded_mask)
gt_annotations.append(ann)
for i, ann in enumerate(gt_annotations):
ann['id'] = i + 1
if label_map:
gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
else:
category_ids = [gt['category_id'] for gt in gt_annotations]
gt_categories = [{'id': i} for i in set(category_ids)]
gt_dataset = {
'images': gt_images,
'categories': gt_categories,
'annotations': copy.deepcopy(gt_annotations),
}
return gt_dataset
class COCOGroundtruthGenerator:
"""Generates the groundtruth annotations from a single example."""
def __init__(self, file_pattern, num_examples, include_mask):
self._file_pattern = file_pattern
self._num_examples = num_examples
self._include_mask = include_mask
self._dataset_fn = tf.data.TFRecordDataset
def _parse_single_example(self, example):
"""Parses a single serialized tf.Example proto.
Args:
example: a serialized tf.Example proto string.
Returns:
A dictionary of groundtruth with the following fields:
source_id: a scalar tensor of int64 representing the image source_id.
height: a scalar tensor of int64 representing the image height.
width: a scalar tensor of int64 representing the image width.
boxes: a float tensor of shape [K, 4], representing the groundtruth
boxes in absolute coordinates with respect to the original image size.
classes: an int64 tensor of shape [K], representing the class labels of
  each instance.
is_crowds: a bool tensor of shape [K], indicating whether the instance
is crowd.
areas: a float tensor of shape [K], indicating the area of each
instance.
masks: a string tensor of shape [K], containing the bytes of the png
mask of each instance.
"""
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=self._include_mask)
decoded_tensors = decoder.decode(example)
image = decoded_tensors['image']
image_size = tf.shape(image)[0:2]
boxes = box_ops.denormalize_boxes(
decoded_tensors['groundtruth_boxes'], image_size)
groundtruths = {
'source_id': tf.strings.to_number(
    decoded_tensors['source_id'], out_type=tf.int64),
'height': decoded_tensors['height'],
'width': decoded_tensors['width'],
'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
'boxes': boxes,
'classes': decoded_tensors['groundtruth_classes'],
'is_crowds': decoded_tensors['groundtruth_is_crowd'],
'areas': decoded_tensors['groundtruth_area'],
}
if self._include_mask:
groundtruths.update({
'masks': decoded_tensors['groundtruth_instance_masks_png'],
})
return groundtruths
def _build_pipeline(self):
"""Builds data pipeline to generate groundtruth annotations."""
dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
dataset = dataset.interleave(
map_func=lambda filename: self._dataset_fn(filename).prefetch(1),
cycle_length=12,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.map(self._parse_single_example,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(1, drop_remainder=False)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def __call__(self):
for groundtruth_result in self._build_pipeline():
yield groundtruth_result
def scan_and_generator_annotation_file(file_pattern: str,
num_samples: int,
include_mask: bool,
annotation_file: str):
"""Scans and generate the COCO-style annotation JSON file given a dataset."""
groundtruth_generator = COCOGroundtruthGenerator(
file_pattern, num_samples, include_mask)
generate_annotation_file(groundtruth_generator, annotation_file)
def generate_annotation_file(groundtruth_generator,
annotation_file):
"""Generates COCO-style annotation JSON file given a groundtruth generator."""
groundtruths = {}
logging.info('Loading groundtruth annotations from dataset to memory...')
for groundtruth in groundtruth_generator():
for k, v in six.iteritems(groundtruth):
if k not in groundtruths:
groundtruths[k] = [v]
else:
groundtruths[k].append(v)
gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths)
logging.info('Saving groundtruth annotations to the JSON file...')
with tf.io.gfile.GFile(annotation_file, 'w') as f:
f.write(json.dumps(gt_dataset))
logging.info('Done saving the JSON file...')
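# A minimal sketch (illustrative, not part of this change) of
# `generate_annotation_file` with an in-memory generator. The single-image
# groundtruth below is hypothetical: it carries a leading batch dimension of
# 1 and uses absolute [y1, x1, y2, x2] boxes, as expected by
# `convert_groundtruths_to_coco_dataset`.
def _tiny_groundtruth_generator():
  yield {
      'source_id': np.array([1]),
      'height': np.array([640]),
      'width': np.array([480]),
      'num_detections': np.array([1]),
      'boxes': np.array([[[10.0, 20.0, 110.0, 220.0]]]),  # [1, K=1, 4]
      'classes': np.array([[1]]),  # [1, K=1]
  }
# Writing the file is left commented out to keep the module import-safe;
# the output path below is a placeholder:
# generate_annotation_file(_tiny_groundtruth_generator, '/tmp/tiny_coco.json')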
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Backbones package definition."""
from official.vision.beta.modeling.backbones.efficientnet import EfficientNet
from official.vision.beta.modeling.backbones.resnet import ResNet
from official.vision.beta.modeling.backbones.resnet_3d import ResNet3D
from official.vision.beta.modeling.backbones.revnet import RevNet
from official.vision.beta.modeling.backbones.spinenet import SpineNet
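# A minimal usage sketch (an assumption, not part of the package
# definition): constructing a backbone from this package and running a dummy
# forward pass, mirroring the EfficientNet tests; the 224x224 input size is
# a placeholder.
def _example_backbone_forward_sketch():
  import tensorflow as tf  # Local import to keep the package import light.
  network = EfficientNet(model_id='b0')
  inputs = tf.keras.Input(shape=(224, 224, 3), batch_size=1)
  # Backbones return a dict of feature maps keyed by pyramid level.
  return network(inputs)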
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for EfficientNet."""
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.modeling.backbones import efficientnet
class EfficientNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(32, 224)
def test_network_creation(self, input_size):
"""Test creation of EfficientNet family models."""
tf.keras.backend.set_image_data_format('channels_last')
network = efficientnet.EfficientNet(model_id='b0')
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
endpoints = network(inputs)
self.assertAllEqual([1, input_size // 2**2, input_size // 2**2, 24],
                    endpoints[2].shape.as_list())
self.assertAllEqual([1, input_size // 2**3, input_size // 2**3, 40],
                    endpoints[3].shape.as_list())
self.assertAllEqual([1, input_size // 2**4, input_size // 2**4, 112],
                    endpoints[4].shape.as_list())
self.assertAllEqual([1, input_size // 2**5, input_size // 2**5, 320],
                    endpoints[5].shape.as_list())
@parameterized.parameters('b0', 'b3', 'b6')
def test_network_scaling(self, model_id):
"""Test compound scaling."""
efficientnet_params = {
'b0': 4049564,
'b3': 10783528,
'b6': 40960136,
}
tf.keras.backend.set_image_data_format('channels_last')
input_size = 32
network = efficientnet.EfficientNet(model_id=model_id, se_ratio=0.25)
self.assertEqual(network.count_params(), efficientnet_params[model_id])
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
_ = network(inputs)
@parameterized.parameters(1, 3)
def test_input_specs(self, input_dim):
"""Test different input feature dimensions."""
tf.keras.backend.set_image_data_format('channels_last')
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim])
network = efficientnet.EfficientNet(model_id='b0', input_specs=input_specs)
inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1)
_ = network(inputs)
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
model_id='b0',
se_ratio=0.25,
stochastic_depth_drop_rate=None,
use_sync_bn=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
norm_momentum=0.99,
norm_epsilon=0.001,
)
network = efficientnet.EfficientNet(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = efficientnet.EfficientNet.from_config(network.get_config())
# Validate that the config can be forced to JSON.
_ = new_network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
if __name__ == '__main__':
tf.test.main()