Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

09d9656f · Srihari Humbarwadi · GitHub · ac671306 · 49a5706c · 09d9656f
Unverified Commit 09d9656f authored Jan 13, 2022 by Srihari Humbarwadi Committed by GitHub Jan 13, 2022
20 changed files
--- a/official/vision/beta/configs/semantic_segmentation.py
+++ b/official/vision/beta/configs/semantic_segmentation.py
@@ -76,6 +76,16 @@ class SegmentationHead(hyperparams.Config):
  decoder_max_level: Optional[Union[int, str]] = None
+@dataclasses.dataclass
+class MaskScoringHead(hyperparams.Config):
+  """Mask Scoring head config.
+
+  NOTE(review): the field meanings below are inferred from their names; the
+  head that consumes this config is not visible here -- confirm before relying
+  on them.
+
+  Attributes:
+    num_convs: presumably the number of convolution layers; defaults to 4.
+    num_filters: presumably the filter count of each convolution; defaults to
+      128.
+    fc_input_size: list of ints, empty by default; presumably the spatial size
+      fed to the fully connected layers -- TODO confirm.
+    num_fcs: presumably the number of fully connected layers; defaults to 2.
+    fc_dims: presumably the width of each fully connected layer; defaults to
+      1024.
+  """
+  num_convs: int = 4
+  num_filters: int = 128
+  # default_factory gives every config instance its own empty list.
+  fc_input_size: List[int] = dataclasses.field(default_factory=list)
+  num_fcs: int = 2
+  fc_dims: int = 1024
 @dataclasses.dataclass
 class SemanticSegmentationModel(hyperparams.Config):
  """Semantic segmentation model config."""
@@ -87,6 +97,7 @@ class SemanticSegmentationModel(hyperparams.Config):
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='identity')
+  mask_scoring_head: Optional[MaskScoringHead] = None
  norm_activation: common.NormActivation = common.NormActivation()

--- a/official/vision/beta/data/process_coco_panoptic.sh
+++ b/official/vision/beta/data/process_coco_panoptic.sh
+#!/bin/bash
+# Downloads the COCO 2017 images plus the instance and panoptic annotations
+# into DATA_DIR, then converts the val and train splits into TFRecords that
+# include instance masks and panoptic masks.
+#
+# Usage: process_coco_panoptic.sh DATA_DIR
+# Run from the repository root: the converter below is addressed by a
+# repository-relative path.
+sudo apt update
+sudo apt install unzip aria2 -y
+# Abort with a usage message when DATA_DIR is missing or empty; otherwise
+# paths such as "$DATA_DIR/zips" would silently resolve under "/".
+DATA_DIR="${1:?Usage: $0 DATA_DIR}"
+aria2c -j 8 -Z \
+  http://images.cocodataset.org/annotations/annotations_trainval2017.zip \
+  http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \
+  http://images.cocodataset.org/zips/train2017.zip \
+  http://images.cocodataset.org/zips/val2017.zip \
+  --dir="$DATA_DIR"
+# The "*" stays quoted on purpose: unzip expands the wildcard itself so that
+# every downloaded archive is extracted, not just the first shell match.
+unzip "$DATA_DIR"/"*".zip -d "$DATA_DIR"
+# -p keeps a re-run from failing when the zips directory already exists.
+mkdir -p "$DATA_DIR/zips" && mv "$DATA_DIR"/*.zip "$DATA_DIR/zips"
+# The panoptic masks ship as nested archives inside the annotations folder.
+unzip "$DATA_DIR/annotations/panoptic_train2017.zip" -d "$DATA_DIR"
+unzip "$DATA_DIR/annotations/panoptic_val2017.zip" -d "$DATA_DIR"
+python3 official/vision/beta/data/create_coco_tf_record.py \
+  --logtostderr \
+  --image_dir="$DATA_DIR/val2017" \
+  --object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \
+  --output_file_prefix="$DATA_DIR/tfrecords/val" \
+  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \
+  --panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \
+  --num_shards=8 \
+  --include_masks \
+  --include_panoptic_masks
+python3 official/vision/beta/data/create_coco_tf_record.py \
+  --logtostderr \
+  --image_dir="$DATA_DIR/train2017" \
+  --object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \
+  --output_file_prefix="$DATA_DIR/tfrecords/train" \
+  --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \
+  --panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \
+  --num_shards=32 \
+  --include_masks \
+  --include_panoptic_masks
--- a/official/vision/beta/dataloaders/parser.py
+++ b/official/vision/beta/dataloaders/parser.py
@@ -55,7 +55,7 @@ class Parser(object):
      is_training: a `bool` to indicate whether it is in training mode.
    Returns:
-      parse: a `callable` that takes the serialized examle and generate the
+      parse: a `callable` that takes the serialized example and generates the
        images, labels tuple where labels is a dict of Tensors that contains
        labels.
    """

--- a/official/vision/beta/dataloaders/retinanet_input.py
+++ b/official/vision/beta/dataloaders/retinanet_input.py
@@ -19,11 +19,13 @@ into (image, labels) tuple for RetinaNet.
 """
 # Import libraries
+from absl import logging
 import tensorflow as tf
 from official.vision.beta.dataloaders import parser
 from official.vision.beta.dataloaders import utils
 from official.vision.beta.ops import anchor
+from official.vision.beta.ops import augment
 from official.vision.beta.ops import box_ops
 from official.vision.beta.ops import preprocess_ops
@@ -40,6 +42,7 @@ class Parser(parser.Parser):
               anchor_size,
               match_threshold=0.5,
               unmatched_threshold=0.5,
+               aug_type=None,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
@@ -71,6 +74,8 @@ class Parser(parser.Parser):
      unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
+      aug_type: An optional Augmentation object to choose from AutoAugment and
+        RandAugment. The latter is not supported, and will raise ValueError.
      aug_rand_hflip: `bool`, if True, augment training with random horizontal
        flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
@@ -108,7 +113,20 @@ class Parser(parser.Parser):
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max
-    # Data Augmentation with AutoAugment.
+    # Data augmentation with AutoAugment or RandAugment.
+    self._augmenter = None
+    if aug_type is not None:
+      if aug_type.type == 'autoaug':
+        logging.info('Using AutoAugment.')
+        self._augmenter = augment.AutoAugment(
+            augmentation_name=aug_type.autoaug.augmentation_name,
+            cutout_const=aug_type.autoaug.cutout_const,
+            translate_const=aug_type.autoaug.translate_const)
+      else:
+        # TODO(b/205346436) Support RandAugment.
+        raise ValueError(f'Augmentation policy {aug_type.type} not supported.')
+    # Deprecated. Data Augmentation with AutoAugment.
    self._use_autoaugment = use_autoaugment
    self._autoaugment_policy_name = autoaugment_policy_name
@@ -138,9 +156,13 @@ class Parser(parser.Parser):
      for k, v in attributes.items():
        attributes[k] = tf.gather(v, indices)
-    # Gets original image and its size.
+    # Gets original image.
    image = data['image']
+    # Apply autoaug or randaug.
+    if self._augmenter is not None:
+      image, boxes = self._augmenter.distort_with_boxes(image, boxes)
    image_shape = tf.shape(input=image)[0:2]
    # Normalizes image with mean and std pixel values.

--- a/official/vision/beta/dataloaders/tf_example_decoder.py
+++ b/official/vision/beta/dataloaders/tf_example_decoder.py
@@ -23,8 +23,9 @@ from official.vision.beta.dataloaders import decoder
 def _generate_source_id(image_bytes):
+  # Derives a source id from `image_bytes`: the input is hashed into one of
+  # 2 ** 22 - 1 buckets and the bucket index is returned as a string tensor.
+  # NOTE(review): with roughly 4.2M buckets, distinct inputs can share an id;
+  # confirm that collisions are acceptable to downstream consumers.
+  # Hashing using 22 bits since float32 has only 23 mantissa bits.
  return tf.strings.as_string(
-      tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 63 - 1))
+      tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 22 - 1))
 class TfExampleDecoder(decoder.Decoder):

--- a/official/vision/beta/dataloaders/tf_example_decoder_test.py
+++ b/official/vision/beta/dataloaders/tf_example_decoder_test.py
@@ -14,24 +14,13 @@
 """Tests for tf_example_decoder.py."""
-import io
 # Import libraries
 from absl.testing import parameterized
 import numpy as np
-from PIL import Image
 import tensorflow as tf
 from official.vision.beta.dataloaders import tf_example_decoder
+from official.vision.beta.dataloaders import tfexample_utils
-DUMP_SOURCE_ID = b'123'
-def _encode_image(image_array, fmt):
-  image = Image.fromarray(image_array)
-  with io.BytesIO() as output:
-    image.save(output, format=fmt)
-    return output.getvalue()
 class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
@@ -52,73 +41,11 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
    decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=True, regenerate_source_id=regenerate_source_id)
-    image = _encode_image(
+    serialized_example = tfexample_utils.create_detection_test_example(
-        np.uint8(np.random.rand(image_height, image_width, 3) * 255),
+        image_height=image_height,
-        fmt='JPEG')
+        image_width=image_width,
-    if num_instances == 0:
+        image_channel=3,
-      xmins = []
+        num_instances=num_instances).SerializeToString()
-      xmaxs = []
-      ymins = []
-      ymaxs = []
-      labels = []
-      areas = []
-      is_crowds = []
-      masks = []
-    else:
-      xmins = list(np.random.rand(num_instances))
-      xmaxs = list(np.random.rand(num_instances))
-      ymins = list(np.random.rand(num_instances))
-      ymaxs = list(np.random.rand(num_instances))
-      labels = list(np.random.randint(100, size=num_instances))
-      areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
-               for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
-      is_crowds = [0] * num_instances
-      masks = []
-      for _ in range(num_instances):
-        mask = _encode_image(
-            np.uint8(np.random.rand(image_height, image_width) * 255),
-            fmt='PNG')
-        masks.append(mask)
-    serialized_example = tf.train.Example(
-        features=tf.train.Features(
-            feature={
-                'image/encoded': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=[image]))),
-                'image/source_id': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
-                'image/height': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=[image_height]))),
-                'image/width': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=[image_width]))),
-                'image/object/bbox/xmin': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=xmins))),
-                'image/object/bbox/xmax': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=xmaxs))),
-                'image/object/bbox/ymin': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymins))),
-                'image/object/bbox/ymax': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymaxs))),
-                'image/object/class/label': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=labels))),
-                'image/object/is_crowd': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=is_crowds))),
-                'image/object/area': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=areas))),
-                'image/object/mask': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=masks))),
-            })).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))
@@ -127,7 +54,7 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    if not regenerate_source_id:
-      self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
+      self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
@@ -151,7 +78,7 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
-    image = _encode_image(np.uint8(image_content), fmt='PNG')
+    image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
    image_height = 4
    image_width = 4
    num_instances = 2
@@ -172,46 +99,38 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
                     [0, 255, 255, 255],
                     [0, 255, 255, 255],
                     [0, 255, 255, 255]]]
-    masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
+    masks = [
+        tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
+        for m in list(mask_content)
+    ]
    serialized_example = tf.train.Example(
        features=tf.train.Features(
            feature={
-                'image/encoded': (
+                'image/encoded': (tf.train.Feature(
-                    tf.train.Feature(
+                    bytes_list=tf.train.BytesList(value=[image]))),
-                        bytes_list=tf.train.BytesList(value=[image]))),
+                'image/source_id': (tf.train.Feature(
-                'image/source_id': (
+                    bytes_list=tf.train.BytesList(
-                    tf.train.Feature(
+                        value=[tfexample_utils.DUMP_SOURCE_ID]))),
-                        bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
+                'image/height': (tf.train.Feature(
-                'image/height': (
+                    int64_list=tf.train.Int64List(value=[image_height]))),
-                    tf.train.Feature(
+                'image/width': (tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=[image_height]))),
+                    int64_list=tf.train.Int64List(value=[image_width]))),
-                'image/width': (
+                'image/object/bbox/xmin': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=xmins))),
-                        int64_list=tf.train.Int64List(value=[image_width]))),
+                'image/object/bbox/xmax': (tf.train.Feature(
-                'image/object/bbox/xmin': (
+                    float_list=tf.train.FloatList(value=xmaxs))),
-                    tf.train.Feature(
+                'image/object/bbox/ymin': (tf.train.Feature(
-                        float_list=tf.train.FloatList(value=xmins))),
+                    float_list=tf.train.FloatList(value=ymins))),
-                'image/object/bbox/xmax': (
+                'image/object/bbox/ymax': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=ymaxs))),
-                        float_list=tf.train.FloatList(value=xmaxs))),
+                'image/object/class/label': (tf.train.Feature(
-                'image/object/bbox/ymin': (
+                    int64_list=tf.train.Int64List(value=labels))),
-                    tf.train.Feature(
+                'image/object/is_crowd': (tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymins))),
+                    int64_list=tf.train.Int64List(value=is_crowds))),
-                'image/object/bbox/ymax': (
+                'image/object/area': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=areas))),
-                        float_list=tf.train.FloatList(value=ymaxs))),
+                'image/object/mask': (tf.train.Feature(
-                'image/object/class/label': (
+                    bytes_list=tf.train.BytesList(value=masks))),
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=labels))),
-                'image/object/is_crowd': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=is_crowds))),
-                'image/object/area': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=areas))),
-                'image/object/mask': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=masks))),
            })).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))
@@ -221,7 +140,7 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    self.assertAllEqual(image_content, results['image'])
-    self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
+    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
@@ -259,7 +178,7 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
-    image = _encode_image(np.uint8(image_content), fmt='PNG')
+    image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
    image_height = 4
    image_width = 4
    num_instances = 2
@@ -276,40 +195,34 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
                     [0, 255, 255, 255],
                     [0, 255, 255, 255],
                     [0, 255, 255, 255]]]
-    masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
+    masks = [
+        tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
+        for m in list(mask_content)
+    ]
    serialized_example = tf.train.Example(
        features=tf.train.Features(
            feature={
-                'image/encoded': (
+                'image/encoded': (tf.train.Feature(
-                    tf.train.Feature(
+                    bytes_list=tf.train.BytesList(value=[image]))),
-                        bytes_list=tf.train.BytesList(value=[image]))),
+                'image/source_id': (tf.train.Feature(
-                'image/source_id': (
+                    bytes_list=tf.train.BytesList(
-                    tf.train.Feature(
+                        value=[tfexample_utils.DUMP_SOURCE_ID]))),
-                        bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
+                'image/height': (tf.train.Feature(
-                'image/height': (
+                    int64_list=tf.train.Int64List(value=[image_height]))),
-                    tf.train.Feature(
+                'image/width': (tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=[image_height]))),
+                    int64_list=tf.train.Int64List(value=[image_width]))),
-                'image/width': (
+                'image/object/bbox/xmin': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=xmins))),
-                        int64_list=tf.train.Int64List(value=[image_width]))),
+                'image/object/bbox/xmax': (tf.train.Feature(
-                'image/object/bbox/xmin': (
+                    float_list=tf.train.FloatList(value=xmaxs))),
-                    tf.train.Feature(
+                'image/object/bbox/ymin': (tf.train.Feature(
-                        float_list=tf.train.FloatList(value=xmins))),
+                    float_list=tf.train.FloatList(value=ymins))),
-                'image/object/bbox/xmax': (
+                'image/object/bbox/ymax': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=ymaxs))),
-                        float_list=tf.train.FloatList(value=xmaxs))),
+                'image/object/class/label': (tf.train.Feature(
-                'image/object/bbox/ymin': (
+                    int64_list=tf.train.Int64List(value=labels))),
-                    tf.train.Feature(
+                'image/object/mask': (tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymins))),
+                    bytes_list=tf.train.BytesList(value=masks))),
-                'image/object/bbox/ymax': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymaxs))),
-                'image/object/class/label': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=labels))),
-                'image/object/mask': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=masks))),
            })).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(serialized_example))
@@ -318,7 +231,7 @@ class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase):
    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    self.assertAllEqual(image_content, results['image'])
-    self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
+    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(

--- a/official/vision/beta/dataloaders/tf_example_label_map_decoder_test.py
+++ b/official/vision/beta/dataloaders/tf_example_label_map_decoder_test.py
@@ -14,28 +14,19 @@
 """Tests for tf_example_label_map_decoder.py."""
-import io
 import os
 # Import libraries
 from absl.testing import parameterized
 import numpy as np
-from PIL import Image
 import tensorflow as tf
 from official.vision.beta.dataloaders import tf_example_label_map_decoder
+from official.vision.beta.dataloaders import tfexample_utils
-DUMP_SOURCE_ID = b'123'
 LABEL_MAP_CSV_CONTENT = '0,class_0\n1,class_1\n2,class_2'
-def _encode_image(image_array, fmt):
-  image = Image.fromarray(image_array)
-  with io.BytesIO() as output:
-    image.save(output, format=fmt)
-    return output.getvalue()
 class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
  @parameterized.parameters(
@@ -56,74 +47,11 @@ class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
    decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
        label_map_path, include_mask=True)
-    image = _encode_image(
+    serialized_example = tfexample_utils.create_detection_test_example(
-        np.uint8(np.random.rand(image_height, image_width, 3) * 255),
+        image_height=image_height,
-        fmt='JPEG')
+        image_width=image_width,
-    if num_instances == 0:
+        image_channel=3,
-      xmins = []
+        num_instances=num_instances).SerializeToString()
-      xmaxs = []
-      ymins = []
-      ymaxs = []
-      labels = []
-      areas = []
-      is_crowds = []
-      masks = []
-    else:
-      xmins = list(np.random.rand(num_instances))
-      xmaxs = list(np.random.rand(num_instances))
-      ymins = list(np.random.rand(num_instances))
-      ymaxs = list(np.random.rand(num_instances))
-      labels = list(np.random.randint(100, size=num_instances))
-      areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
-               for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
-      is_crowds = [0] * num_instances
-      masks = []
-      labels = [b'class_1'] * num_instances
-      for _ in range(num_instances):
-        mask = _encode_image(
-            np.uint8(np.random.rand(image_height, image_width) * 255),
-            fmt='PNG')
-        masks.append(mask)
-    serialized_example = tf.train.Example(
-        features=tf.train.Features(
-            feature={
-                'image/encoded': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=[image]))),
-                'image/source_id': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
-                'image/height': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=[image_height]))),
-                'image/width': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=[image_width]))),
-                'image/object/bbox/xmin': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=xmins))),
-                'image/object/bbox/xmax': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=xmaxs))),
-                'image/object/bbox/ymin': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymins))),
-                'image/object/bbox/ymax': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymaxs))),
-                'image/object/class/text': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=labels))),
-                'image/object/is_crowd': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=is_crowds))),
-                'image/object/area': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=areas))),
-                'image/object/mask': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=masks))),
-            })).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))
@@ -131,7 +59,7 @@ class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
-    self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
+    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(
@@ -162,7 +90,7 @@ class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]],
                     [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]
-    image = _encode_image(np.uint8(image_content), fmt='PNG')
+    image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG')
    image_height = 4
    image_width = 4
    num_instances = 2
@@ -183,46 +111,38 @@ class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
                     [0, 255, 255, 255],
                     [0, 255, 255, 255],
                     [0, 255, 255, 255]]]
-    masks = [_encode_image(np.uint8(m), fmt='PNG') for m in list(mask_content)]
+    masks = [
+        tfexample_utils.encode_image(np.uint8(m), fmt='PNG')
+        for m in list(mask_content)
+    ]
    serialized_example = tf.train.Example(
        features=tf.train.Features(
            feature={
-                'image/encoded': (
+                'image/encoded': (tf.train.Feature(
-                    tf.train.Feature(
+                    bytes_list=tf.train.BytesList(value=[image]))),
-                        bytes_list=tf.train.BytesList(value=[image]))),
+                'image/source_id': (tf.train.Feature(
-                'image/source_id': (
+                    bytes_list=tf.train.BytesList(
-                    tf.train.Feature(
+                        value=[tfexample_utils.DUMP_SOURCE_ID]))),
-                        bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
+                'image/height': (tf.train.Feature(
-                'image/height': (
+                    int64_list=tf.train.Int64List(value=[image_height]))),
-                    tf.train.Feature(
+                'image/width': (tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=[image_height]))),
+                    int64_list=tf.train.Int64List(value=[image_width]))),
-                'image/width': (
+                'image/object/bbox/xmin': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=xmins))),
-                        int64_list=tf.train.Int64List(value=[image_width]))),
+                'image/object/bbox/xmax': (tf.train.Feature(
-                'image/object/bbox/xmin': (
+                    float_list=tf.train.FloatList(value=xmaxs))),
-                    tf.train.Feature(
+                'image/object/bbox/ymin': (tf.train.Feature(
-                        float_list=tf.train.FloatList(value=xmins))),
+                    float_list=tf.train.FloatList(value=ymins))),
-                'image/object/bbox/xmax': (
+                'image/object/bbox/ymax': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=ymaxs))),
-                        float_list=tf.train.FloatList(value=xmaxs))),
+                'image/object/class/text': (tf.train.Feature(
-                'image/object/bbox/ymin': (
+                    bytes_list=tf.train.BytesList(value=labels))),
-                    tf.train.Feature(
+                'image/object/is_crowd': (tf.train.Feature(
-                        float_list=tf.train.FloatList(value=ymins))),
+                    int64_list=tf.train.Int64List(value=is_crowds))),
-                'image/object/bbox/ymax': (
+                'image/object/area': (tf.train.Feature(
-                    tf.train.Feature(
+                    float_list=tf.train.FloatList(value=areas))),
-                        float_list=tf.train.FloatList(value=ymaxs))),
+                'image/object/mask': (tf.train.Feature(
-                'image/object/class/text': (
+                    bytes_list=tf.train.BytesList(value=masks))),
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=labels))),
-                'image/object/is_crowd': (
-                    tf.train.Feature(
-                        int64_list=tf.train.Int64List(value=is_crowds))),
-                'image/object/area': (
-                    tf.train.Feature(
-                        float_list=tf.train.FloatList(value=areas))),
-                'image/object/mask': (
-                    tf.train.Feature(
-                        bytes_list=tf.train.BytesList(value=masks))),
            })).SerializeToString()
    decoded_tensors = decoder.decode(
        tf.convert_to_tensor(value=serialized_example))
@@ -232,7 +152,7 @@ class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase):
    self.assertAllEqual(
        (image_height, image_width, 3), results['image'].shape)
    self.assertAllEqual(image_content, results['image'])
-    self.assertEqual(DUMP_SOURCE_ID, results['source_id'])
+    self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id'])
    self.assertEqual(image_height, results['height'])
    self.assertEqual(image_width, results['width'])
    self.assertAllEqual(

--- a/official/vision/beta/dataloaders/tfds_factory_test.py
+++ b/official/vision/beta/dataloaders/tfds_factory_test.py
@@ -23,6 +23,22 @@ from official.vision.beta.dataloaders import tfds_factory
 class TFDSFactoryTest(tf.test.TestCase, parameterized.TestCase):
+  def _create_test_example(self):
+    """Builds the dummy feature dict that the decoder tests pass to `decode`.
+
+    Returns:
+      A dict (not a serialized proto) holding constant image tensors together
+      with scalar label, id and single-object annotation values.
+    """
+    rgb_shape = (100, 100, 3)
+    # One object's annotations, stored as plain Python values.
+    single_object = {
+        'label': 1,
+        'is_crowd': 0,
+        'area': 0.5,
+        'bbox': [0.1, 0.2, 0.3, 0.4]
+    }
+    return {
+        'image': tf.ones(shape=rgb_shape, dtype=tf.uint8),
+        'label': 1,
+        'image/id': 0,
+        'objects': single_object,
+        'segmentation_label': tf.ones((100, 100, 1), dtype=tf.uint8),
+        'image_left': tf.ones(shape=rgb_shape, dtype=tf.uint8)
+    }
  @parameterized.parameters(
      ('imagenet2012'),
      ('cifar10'),
@@ -31,6 +47,10 @@ class TFDSFactoryTest(tf.test.TestCase, parameterized.TestCase):
  def test_classification_decoder(self, tfds_name):
    decoder = tfds_factory.get_classification_decoder(tfds_name)
    self.assertIsInstance(decoder, base_decoder.Decoder)
+    decoded_tensor = decoder.decode(self._create_test_example())
+    self.assertLen(decoded_tensor, 2)
+    self.assertIn('image/encoded', decoded_tensor)
+    self.assertIn('image/class/label', decoded_tensor)
  @parameterized.parameters(
      ('flowers'),
@@ -48,6 +68,16 @@ class TFDSFactoryTest(tf.test.TestCase, parameterized.TestCase):
  def test_detection_decoder(self, tfds_name):
    decoder = tfds_factory.get_detection_decoder(tfds_name)
    self.assertIsInstance(decoder, base_decoder.Decoder)
+    decoded_tensor = decoder.decode(self._create_test_example())
+    self.assertLen(decoded_tensor, 8)
+    self.assertIn('image', decoded_tensor)
+    self.assertIn('source_id', decoded_tensor)
+    self.assertIn('height', decoded_tensor)
+    self.assertIn('width', decoded_tensor)
+    self.assertIn('groundtruth_classes', decoded_tensor)
+    self.assertIn('groundtruth_is_crowd', decoded_tensor)
+    self.assertIn('groundtruth_area', decoded_tensor)
+    self.assertIn('groundtruth_boxes', decoded_tensor)
  @parameterized.parameters(
      ('pascal'),
@@ -65,6 +95,12 @@ class TFDSFactoryTest(tf.test.TestCase, parameterized.TestCase):
  def test_segmentation_decoder(self, tfds_name):
    decoder = tfds_factory.get_segmentation_decoder(tfds_name)
    self.assertIsInstance(decoder, base_decoder.Decoder)
+    decoded_tensor = decoder.decode(self._create_test_example())
+    self.assertLen(decoded_tensor, 4)
+    self.assertIn('image/encoded', decoded_tensor)
+    self.assertIn('image/segmentation/class/encoded', decoded_tensor)
+    self.assertIn('image/height', decoded_tensor)
+    self.assertIn('image/width', decoded_tensor)
  @parameterized.parameters(
      ('coco'),

--- a/official/vision/beta/dataloaders/tfexample_utils.py
+++ b/official/vision/beta/dataloaders/tfexample_utils.py
@@ -54,16 +54,20 @@ IMAGE_KEY = 'image/encoded'
 CLASSIFICATION_LABEL_KEY = 'image/class/label'
 LABEL_KEY = 'clip/label/index'
 AUDIO_KEY = 'features/audio'
+DUMP_SOURCE_ID = b'123'
-def make_image_bytes(shape: Sequence[int]):
+def encode_image(image_array: np.ndarray, fmt: str) -> bytes:
-  """Generates image and return bytes in JPEG format."""
+  """Encodes an image array into bytes of the given image format."""
+  image = Image.fromarray(image_array)
+  with io.BytesIO() as output:
+    image.save(output, format=fmt)
+    return output.getvalue()
+def make_image_bytes(shape: Sequence[int], fmt: str = 'JPEG') -> bytes:
+  """Generates a random image and returns its bytes in the specified format."""
   random_image = np.random.randint(0, 256, size=shape, dtype=np.uint8)
-  random_image = Image.fromarray(random_image)
+  return encode_image(random_image, fmt=fmt)
-  with io.BytesIO() as buffer:
-    random_image.save(buffer, format='JPEG')
-    raw_image_bytes = buffer.getvalue()
-  return raw_image_bytes
 def put_int64_to_context(seq_example: tf.train.SequenceExample,
@@ -164,3 +168,102 @@ def create_3d_image_test_example(image_height: int, image_width: int,
          bytes_list=tf.train.BytesList(value=[labels.tobytes()])))
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))
+def create_detection_test_example(image_height: int, image_width: int,
+                                  image_channel: int,
+                                  num_instances: int) -> tf.train.Example:
+  """Creates and returns a test example containing box and mask annotations.
+  Box coordinates are random values in [0, 1) with min <= max on each axis, so
+  the derived areas are never negative. With `num_instances == 0` every
+  per-object feature is an empty list.
+  Args:
+    image_height: The height of test image.
+    image_width: The width of test image.
+    image_channel: The channel of test image.
+    num_instances: The number of object instances per image.
+  Returns:
+    A tf.train.Example for testing.
+  """
+  image = make_image_bytes([image_height, image_width, image_channel])
+  # Draw both edges of each axis together and sort them per instance;
+  # independent draws could produce inverted boxes and negative areas.
+  xmins, xmaxs = (
+      list(edge) for edge in np.sort(np.random.rand(2, num_instances), axis=0))
+  ymins, ymaxs = (
+      list(edge) for edge in np.sort(np.random.rand(2, num_instances), axis=0))
+  labels_text = [b'class_1'] * num_instances
+  labels = list(np.random.randint(100, size=num_instances))
+  # Area in pixels: normalized box extent scaled by the image size.
+  areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width
+           for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)]
+  is_crowds = [0] * num_instances
+  # One PNG-encoded random single-channel mask per instance.
+  masks = [
+      make_image_bytes([image_height, image_width], fmt='PNG')
+      for _ in range(num_instances)
+  ]
+  return tf.train.Example(
+      features=tf.train.Features(
+          feature={
+              'image/encoded': (tf.train.Feature(
+                  bytes_list=tf.train.BytesList(value=[image]))),
+              'image/source_id': (tf.train.Feature(
+                  bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))),
+              'image/height': (tf.train.Feature(
+                  int64_list=tf.train.Int64List(value=[image_height]))),
+              'image/width': (tf.train.Feature(
+                  int64_list=tf.train.Int64List(value=[image_width]))),
+              'image/object/bbox/xmin': (tf.train.Feature(
+                  float_list=tf.train.FloatList(value=xmins))),
+              'image/object/bbox/xmax': (tf.train.Feature(
+                  float_list=tf.train.FloatList(value=xmaxs))),
+              'image/object/bbox/ymin': (tf.train.Feature(
+                  float_list=tf.train.FloatList(value=ymins))),
+              'image/object/bbox/ymax': (tf.train.Feature(
+                  float_list=tf.train.FloatList(value=ymaxs))),
+              'image/object/class/label': (tf.train.Feature(
+                  int64_list=tf.train.Int64List(value=labels))),
+              'image/object/class/text': (tf.train.Feature(
+                  bytes_list=tf.train.BytesList(value=labels_text))),
+              'image/object/is_crowd': (tf.train.Feature(
+                  int64_list=tf.train.Int64List(value=is_crowds))),
+              'image/object/area': (tf.train.Feature(
+                  float_list=tf.train.FloatList(value=areas))),
+              'image/object/mask': (tf.train.Feature(
+                  bytes_list=tf.train.BytesList(value=masks))),
+          }))
+def create_segmentation_test_example(image_height: int, image_width: int,
+                                     image_channel: int) -> tf.train.Example:
+  """Builds a test example holding a random image and a random class mask.
+  Args:
+    image_height: The height of test image.
+    image_width: The width of test image.
+    image_channel: The channel of test image.
+  Returns:
+    A tf.train.Example for testing.
+  """
+  def _bytes_feature(value: bytes) -> tf.train.Feature:
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+  def _int64_feature(value: int) -> tf.train.Feature:
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+  # The image is generated before the mask, keeping the random draw order.
+  encoded_image = make_image_bytes([image_height, image_width, image_channel])
+  encoded_mask = make_image_bytes([image_height, image_width], fmt='PNG')
+  feature = {
+      'image/encoded': _bytes_feature(encoded_image),
+      'image/segmentation/class/encoded': _bytes_feature(encoded_mask),
+      'image/height': _int64_feature(image_height),
+      'image/width': _int64_feature(image_width),
+  }
+  return tf.train.Example(features=tf.train.Features(feature=feature))
--- a/official/vision/beta/dataloaders/utils.py
+++ b/official/vision/beta/dataloaders/utils.py
@@ -31,7 +31,7 @@ def process_source_id(source_id: tf.Tensor) -> tf.Tensor:
    A formatted source ID.
  """
  if source_id.dtype == tf.string:
-    source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
+    source_id = tf.strings.to_number(source_id, tf.int64)
  with tf.control_dependencies([source_id]):
    source_id = tf.cond(
        pred=tf.equal(tf.size(input=source_id), 0),

--- a/official/vision/beta/dataloaders/video_input.py
+++ b/official/vision/beta/dataloaders/video_input.py
@@ -361,7 +361,7 @@ class Parser(parser.Parser):
      audio = decoded_tensors[self._audio_feature]
      audio = tf.cast(audio, dtype=self._dtype)
      audio = preprocess_ops_3d.sample_sequence(
-          audio, 20, random=False, stride=1)
+          audio, self._audio_shape[0], random=False, stride=1)
      audio = tf.ensure_shape(audio, self._audio_shape)
      features['audio'] = audio

--- a/official/vision/beta/evaluation/coco_utils.py
+++ b/official/vision/beta/evaluation/coco_utils.py
@@ -212,6 +212,8 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
  gt_annotations = []
  num_batches = len(groundtruths['source_id'])
  for i in range(num_batches):
+    logging.info(
+        'convert_groundtruths_to_coco_dataset: Processing annotation %d', i)
    max_num_instances = groundtruths['classes'][i].shape[1]
    batch_size = groundtruths['source_id'][i].shape[0]
    for j in range(batch_size):
@@ -259,6 +261,10 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
          np_mask[np_mask > 0] = 255
          encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
          ann['segmentation'] = encoded_mask
+          # Ensure the content of `counts` is JSON serializable string.
+          if 'counts' in ann['segmentation']:
+            ann['segmentation']['counts'] = six.ensure_str(
+                ann['segmentation']['counts'])
          if 'areas' not in groundtruths:
            ann['area'] = mask_api.area(encoded_mask)
        gt_annotations.append(ann)
@@ -283,11 +289,13 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
 class COCOGroundtruthGenerator:
  """Generates the groundtruth annotations from a single example."""
-  def __init__(self, file_pattern, file_type, num_examples, include_mask):
+  def __init__(self, file_pattern, file_type, num_examples, include_mask,
+               regenerate_source_id=False):
    self._file_pattern = file_pattern
    self._num_examples = num_examples
    self._include_mask = include_mask
    self._dataset_fn = dataset_fn.pick_dataset_fn(file_type)
+    self._regenerate_source_id = regenerate_source_id
  def _parse_single_example(self, example):
    """Parses a single serialized tf.Example proto.
@@ -312,16 +320,21 @@ class COCOGroundtruthGenerator:
          mask of each instance.
    """
    decoder = tf_example_decoder.TfExampleDecoder(
-        include_mask=self._include_mask)
+        include_mask=self._include_mask,
+        regenerate_source_id=self._regenerate_source_id)
    decoded_tensors = decoder.decode(example)
    image = decoded_tensors['image']
    image_size = tf.shape(image)[0:2]
    boxes = box_ops.denormalize_boxes(
        decoded_tensors['groundtruth_boxes'], image_size)
+    source_id = decoded_tensors['source_id']
+    if source_id.dtype is tf.string:
+      source_id = tf.strings.to_number(source_id, out_type=tf.int64)
    groundtruths = {
-        'source_id': tf.strings.to_number(
+        'source_id': source_id,
-            decoded_tensors['source_id'], out_type=tf.int64),
        'height': decoded_tensors['height'],
        'width': decoded_tensors['width'],
        'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
@@ -341,9 +354,10 @@ class COCOGroundtruthGenerator:
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
    dataset = dataset.interleave(
        map_func=lambda filename: self._dataset_fn(filename).prefetch(1),
-        cycle_length=12,
+        cycle_length=None,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
+    dataset = dataset.take(self._num_examples)
    dataset = dataset.map(self._parse_single_example,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(1, drop_remainder=False)
@@ -351,18 +365,18 @@ class COCOGroundtruthGenerator:
    return dataset
  def __call__(self):
-    for groundtruth_result in self._build_pipeline():
+    return self._build_pipeline()
-      yield groundtruth_result
 def scan_and_generator_annotation_file(file_pattern: str,
                                       file_type: str,
                                       num_samples: int,
                                       include_mask: bool,
-                                       annotation_file: str):
+                                       annotation_file: str,
+                                       regenerate_source_id: bool = False):
  """Scans and generate the COCO-style annotation JSON file given a dataset."""
  groundtruth_generator = COCOGroundtruthGenerator(
-      file_pattern, file_type, num_samples, include_mask)
+      file_pattern, file_type, num_samples, include_mask, regenerate_source_id)
  generate_annotation_file(groundtruth_generator, annotation_file)
@@ -371,7 +385,8 @@ def generate_annotation_file(groundtruth_generator,
  """Generates COCO-style annotation JSON file given a groundtruth generator."""
  groundtruths = {}
  logging.info('Loading groundtruth annotations from dataset to memory...')
-  for groundtruth in groundtruth_generator():
+  for i, groundtruth in enumerate(groundtruth_generator()):
+    logging.info('generate_annotation_file: Processing annotation %d', i)
    for k, v in six.iteritems(groundtruth):
      if k not in groundtruths:
        groundtruths[k] = [v]

--- a/official/vision/beta/evaluation/coco_utils_test.py
+++ b/official/vision/beta/evaluation/coco_utils_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for coco_utils."""
+import os
+import tensorflow as tf
+from official.vision.beta.dataloaders import tfexample_utils
+from official.vision.beta.evaluation import coco_utils
+class CocoUtilsTest(tf.test.TestCase):
+  """Tests for the COCO annotation file utilities."""
+  def test_scan_and_generator_annotation_file(self):
+    """Dumps detection examples to a TFRecord and checks the JSON is written."""
+    num_samples = 10
+    example = tfexample_utils.create_detection_test_example(
+        image_height=512, image_width=512, image_channel=3, num_instances=10)
+    tf_examples = [example] * num_samples
+    data_file = os.path.join(self.create_tempdir(), 'test.tfrecord')
+    tfexample_utils.dump_to_tfrecord(
+        record_file=data_file, tf_examples=tf_examples)
+    annotation_file = os.path.join(self.create_tempdir(), 'annotation.json')
+    coco_utils.scan_and_generator_annotation_file(
+        file_pattern=data_file,
+        file_type='tfrecord',
+        num_samples=num_samples,
+        include_mask=True,
+        annotation_file=annotation_file)
+    # f-string so the failure message names the actual missing path.
+    self.assertTrue(
+        tf.io.gfile.exists(annotation_file),
+        msg=f'Annotation file {annotation_file} does not exist.')
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/evaluation/iou_test.py
+++ b/official/vision/beta/evaluation/iou_test.py
@@ -95,5 +95,21 @@ class MeanIoUTest(tf.test.TestCase):
    expected_result = [0, 1 / (1 + 1 - 1)]
    self.assertAllClose(expected_result, result, atol=1e-3)
+  def test_update_state_annd_result(self):
+    """Checks PerClassIoU when every confusion-matrix cell holds one sample."""
+    y_pred = [0, 1, 0, 1]
+    y_true = [0, 0, 1, 1]
+    m_obj = iou.PerClassIoU(num_classes=2)
+    m_obj.update_state(y_true, y_pred)
+    result = m_obj.result()
+    # cm = [[1, 1],
+    #       [1, 1]]
+    # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
+    # iou = true_positives / (sum_row + sum_col - true_positives)
+    expected_result = [1 / (2 + 2 - 1), 1 / (2 + 2 - 1)]
+    self.assertAllClose(expected_result, result, atol=1e-3)
 if __name__ == '__main__':
  tf.test.main()
--- a/official/vision/beta/evaluation/panoptic_quality_evaluator_test.py
+++ b/official/vision/beta/evaluation/panoptic_quality_evaluator_test.py
@@ -45,19 +45,25 @@ class PanopticQualityEvaluatorTest(tf.test.TestCase):
                                      dtype=np.uint16)
    groundtruths = {
-        'category_mask': tf.convert_to_tensor(category_mask),
+        'category_mask':
-        'instance_mask': tf.convert_to_tensor(groundtruth_instance_mask)
+            tf.convert_to_tensor([category_mask]),
+        'instance_mask':
+            tf.convert_to_tensor([groundtruth_instance_mask]),
+        'image_info':
+            tf.convert_to_tensor([[[6, 6], [6, 6], [1.0, 1.0], [0, 0]]],
+                                 dtype=tf.float32)
    }
    predictions = {
-        'category_mask': tf.convert_to_tensor(category_mask),
+        'category_mask': tf.convert_to_tensor([category_mask]),
-        'instance_mask': tf.convert_to_tensor(good_det_instance_mask)
+        'instance_mask': tf.convert_to_tensor([good_det_instance_mask])
    }
    pq_evaluator = panoptic_quality_evaluator.PanopticQualityEvaluator(
        num_categories=1,
        ignored_label=2,
        max_instances_per_category=16,
-        offset=16)
+        offset=16,
+        rescale_predictions=True)
    for _ in range(2):
      pq_evaluator.update_state(groundtruths, predictions)
@@ -70,7 +76,7 @@ class PanopticQualityEvaluatorTest(tf.test.TestCase):
        [1, 1, 1, 1, 1, 1],
    ],
                                     dtype=np.uint16)
-    predictions['instance_mask'] = tf.convert_to_tensor(bad_det_instance_mask)
+    predictions['instance_mask'] = tf.convert_to_tensor([bad_det_instance_mask])
    for _ in range(2):
      pq_evaluator.update_state(groundtruths, predictions)

--- a/official/vision/beta/evaluation/segmentation_metrics.py
+++ b/official/vision/beta/evaluation/segmentation_metrics.py
@@ -41,8 +41,7 @@ class MeanIoU(tf.keras.metrics.MeanIoU):
      dtype: data type of the metric result.
    """
    self._rescale_predictions = rescale_predictions
-    super(MeanIoU, self).__init__(
+    super().__init__(num_classes=num_classes, name=name, dtype=dtype)
-        num_classes=num_classes, name=name, dtype=dtype)
  def update_state(self, y_true, y_pred):
    """Updates metric state.
@@ -120,9 +119,8 @@ class MeanIoU(tf.keras.metrics.MeanIoU):
      flatten_masks = tf.reshape(masks, shape=[-1])
      flatten_valid_masks = tf.reshape(valid_masks, shape=[-1])
-      super(MeanIoU, self).update_state(
+      super().update_state(flatten_masks, flatten_predictions,
-          flatten_masks, flatten_predictions,
+                           tf.cast(flatten_valid_masks, tf.float32))
-          tf.cast(flatten_valid_masks, tf.float32))
 class PerClassIoU(iou.PerClassIoU):
@@ -148,8 +146,7 @@ class PerClassIoU(iou.PerClassIoU):
      dtype: data type of the metric result.
    """
    self._rescale_predictions = rescale_predictions
-    super(PerClassIoU, self).__init__(
+    super().__init__(num_classes=num_classes, name=name, dtype=dtype)
-        num_classes=num_classes, name=name, dtype=dtype)
  def update_state(self, y_true, y_pred):
    """Updates metric state.
@@ -213,9 +210,8 @@ class PerClassIoU(iou.PerClassIoU):
        flatten_predictions = tf.reshape(predicted_mask, shape=[1, -1])
        flatten_masks = tf.reshape(mask, shape=[1, -1])
        flatten_valid_masks = tf.reshape(valid_mask, shape=[1, -1])
-        super(PerClassIoU, self).update_state(
+        super().update_state(flatten_masks, flatten_predictions,
-            flatten_masks, flatten_predictions,
+                             tf.cast(flatten_valid_masks, tf.float32))
-            tf.cast(flatten_valid_masks, tf.float32))
    else:
      predictions = tf.image.resize(
@@ -227,6 +223,5 @@ class PerClassIoU(iou.PerClassIoU):
      flatten_masks = tf.reshape(masks, shape=[-1])
      flatten_valid_masks = tf.reshape(valid_masks, shape=[-1])
-      super(PerClassIoU, self).update_state(
+      super().update_state(flatten_masks, flatten_predictions,
-          flatten_masks, flatten_predictions,
+                           tf.cast(flatten_valid_masks, tf.float32))
-          tf.cast(flatten_valid_masks, tf.float32))
--- a/official/vision/beta/evaluation/segmentation_metrics_test.py
+++ b/official/vision/beta/evaluation/segmentation_metrics_test.py
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for segmentation_metrics."""
+from absl.testing import parameterized
+import numpy as np
+import tensorflow as tf
+from official.vision.beta.evaluation import segmentation_metrics
+class SegmentationMetricsTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for the segmentation MeanIoU and PerClassIoU metrics."""
+  def _create_test_data(self):
+    """Returns 3x3 two-class predictions and 6x6 groundtruth with image info."""
+    y_pred_cls0 = np.expand_dims(
+        np.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]], dtype=np.uint16),
+        axis=(0, -1))
+    y_pred_cls1 = np.expand_dims(
+        np.array([[0, 0, 0], [0, 0, 1], [0, 0, 1]], dtype=np.uint16),
+        axis=(0, -1))
+    y_pred = np.concatenate((y_pred_cls0, y_pred_cls1), axis=-1)
+    y_true = {
+        'masks':
+            np.expand_dims(
+                np.array([[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0],
+                          [0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1],
+                          [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]],
+                         dtype=np.uint16),
+                axis=(0, -1)),
+        'valid_masks':
+            np.ones([1, 6, 6, 1], dtype=np.uint16),
+        'image_info':
+            np.array([[[6, 6], [3, 3], [0.5, 0.5], [0, 0]]], dtype=np.float32)
+    }
+    return y_pred, y_true
+  @parameterized.parameters(True, False)
+  def test_mean_iou_metric(self, rescale_predictions):
+    """Checks the mean IoU with and without rescaling the predictions."""
+    # `experimental_run_functions_eagerly` is deprecated; use the stable API.
+    tf.config.run_functions_eagerly(True)
+    mean_iou_metric = segmentation_metrics.MeanIoU(
+        num_classes=2, rescale_predictions=rescale_predictions)
+    y_pred, y_true = self._create_test_data()
+    # Disable autograph for correct coverage statistics.
+    update_fn = tf.autograph.experimental.do_not_convert(
+        mean_iou_metric.update_state)
+    update_fn(y_true=y_true, y_pred=y_pred)
+    miou = mean_iou_metric.result()
+    self.assertAlmostEqual(miou.numpy(), 0.762, places=3)
+  @parameterized.parameters(True, False)
+  def test_per_class_mean_iou_metric(self, rescale_predictions):
+    """Checks the per-class IoU with and without rescaling the predictions."""
+    per_class_iou_metric = segmentation_metrics.PerClassIoU(
+        num_classes=2, rescale_predictions=rescale_predictions)
+    y_pred, y_true = self._create_test_data()
+    # Disable autograph for correct coverage statistics.
+    update_fn = tf.autograph.experimental.do_not_convert(
+        per_class_iou_metric.update_state)
+    update_fn(y_true=y_true, y_pred=y_pred)
+    per_class_miou = per_class_iou_metric.result()
+    self.assertAllClose(per_class_miou.numpy(), [0.857, 0.667], atol=1e-3)
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/vision/beta/losses/segmentation_losses.py
+++ b/official/vision/beta/losses/segmentation_losses.py
@@ -17,6 +17,8 @@
 # Import libraries
 import tensorflow as tf
+from official.modeling import tf_utils
 EPSILON = 1e-5
@@ -87,3 +89,46 @@ class SegmentationLoss:
      loss = tf.reduce_sum(top_k_losses) / normalizer
    return loss
+def get_actual_mask_scores(logits, labels, ignore_label):
+  """Gets actual mask scores.
+  Computes, per example and per class, the intersection-over-union between
+  the argmax of `logits` and `labels`, excluding pixels whose label equals
+  `ignore_label`. No gradient flows back through `logits`.
+  Args:
+    logits: A 4-D tensor whose static shape unpacks as
+      (batch, height, width, num_classes).
+    labels: A label tensor; it is resized to (height, width) with nearest
+      neighbor and flattened per example (presumably single-channel class
+      ids -- confirm against callers).
+    ignore_label: The label value whose pixels are dropped from both the
+      overlap and the union.
+  Returns:
+    A float32 tensor of shape [batch, num_classes] with overlap / union per
+    class; the denominator is floored at EPSILON, so a class with an empty
+    union scores 0.
+  """
+  _, height, width, num_classes = logits.get_shape().as_list()
+  batch_size = tf.shape(logits)[0]
+  # The scores are regression targets, so block gradients into the logits.
+  logits = tf.stop_gradient(logits)
+  # Nearest neighbor keeps the resized labels as valid class ids.
+  labels = tf.image.resize(
+      labels, (height, width),
+      method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
+  predicted_labels = tf.argmax(logits, -1, output_type=tf.int32)
+  flat_predictions = tf.reshape(predicted_labels, [batch_size, -1])
+  flat_labels = tf.cast(tf.reshape(labels, [batch_size, -1]), tf.int32)
+  # Boolean one-hot maps of shape [batch, pixels, num_classes].
+  one_hot_predictions = tf.one_hot(
+      flat_predictions, num_classes, on_value=True, off_value=False)
+  one_hot_labels = tf.one_hot(
+      flat_labels, num_classes, on_value=True, off_value=False)
+  # Pixels carrying the ignore label contribute to neither overlap nor union.
+  keep_mask = tf.not_equal(flat_labels, ignore_label)
+  keep_mask = tf.expand_dims(keep_mask, 2)
+  overlap = tf.logical_and(one_hot_predictions, one_hot_labels)
+  overlap = tf.logical_and(overlap, keep_mask)
+  overlap = tf.reduce_sum(tf.cast(overlap, tf.float32), axis=1)
+  union = tf.logical_or(one_hot_predictions, one_hot_labels)
+  union = tf.logical_and(union, keep_mask)
+  union = tf.reduce_sum(tf.cast(union, tf.float32), axis=1)
+  # Floor the denominator to avoid dividing by zero for absent classes.
+  actual_scores = tf.divide(overlap, tf.maximum(union, EPSILON))
+  return actual_scores
+class MaskScoringLoss:
+  """Mean squared error between predicted and measured mask scores."""
+  def __init__(self, ignore_label):
+    self._mse_loss = tf.keras.losses.MeanSquaredError(
+        reduction=tf.keras.losses.Reduction.NONE)
+    self._ignore_label = ignore_label
+  def __call__(self, predicted_scores, logits, labels):
+    # Targets are the scores measured from the logits against the labels.
+    target_scores = get_actual_mask_scores(
+        logits, labels, self._ignore_label)
+    unreduced_errors = self._mse_loss(target_scores, predicted_scores)
+    return tf_utils.safe_mean(unreduced_errors)
--- a/official/vision/beta/modeling/backbones/__init__.py
+++ b/official/vision/beta/modeling/backbones/__init__.py
@@ -16,6 +16,7 @@
 """Backbones package definition."""
 from official.vision.beta.modeling.backbones.efficientnet import EfficientNet
+from official.vision.beta.modeling.backbones.mobiledet import MobileDet
 from official.vision.beta.modeling.backbones.mobilenet import MobileNet
 from official.vision.beta.modeling.backbones.resnet import ResNet
 from official.vision.beta.modeling.backbones.resnet_3d import ResNet3D

--- a/official/vision/beta/modeling/backbones/factory_test.py
+++ b/official/vision/beta/modeling/backbones/factory_test.py
@@ -189,6 +189,40 @@ class FactoryTest(tf.test.TestCase, parameterized.TestCase):
        norm_momentum=0.99,
        norm_epsilon=1e-5)
+  @combinations.generate(
+      combinations.combine(
+          model_id=[
+              'MobileDetCPU',
+              'MobileDetDSP',
+              'MobileDetEdgeTPU',
+              'MobileDetGPU'],
+          filter_size_scale=[1.0, 0.75],
+      ))
+  def test_mobiledet_creation(self, model_id, filter_size_scale):
+    """Test creation of Mobiledet models."""
+    # Reference: build the backbone directly from its constructor.
+    network = backbones.MobileDet(
+        model_id=model_id,
+        filter_size_scale=filter_size_scale,
+        norm_momentum=0.99,
+        norm_epsilon=1e-5)
+    # Same settings expressed as configs and built through the factory.
+    backbone_config = backbones_cfg.Backbone(
+        type='mobiledet',
+        mobiledet=backbones_cfg.MobileDet(
+            model_id=model_id, filter_size_scale=filter_size_scale))
+    norm_activation_config = common_cfg.NormActivation(
+        norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False)
+    factory_network = factory.build_backbone(
+        input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
+        backbone_config=backbone_config,
+        norm_activation_config=norm_activation_config)
+    # Both construction paths must report identical configs.
+    network_config = network.get_config()
+    factory_network_config = factory_network.get_config()
+    self.assertEqual(network_config, factory_network_config)
 if __name__ == '__main__':
  tf.test.main()