Unverified Commit 0225b135 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 7479dbb8 4c571a3c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The COCO-style evaluator.

The following snippet demonstrates the use of interfaces:

  evaluator = COCOEvaluator(...)
  for _ in range(num_evals):
    for _ in range(num_batches_per_eval):
      groundtruths, predictions = predictor.predict(...)  # pop a batch.
      evaluator.update_state(groundtruths, predictions)
    evaluator.result()  # finish one full eval and reset states.

See also: https://github.com/cocodataset/cocoapi/
"""
import atexit
import tempfile
# Import libraries
from absl import logging
import numpy as np
from pycocotools import cocoeval
import six
import tensorflow as tf
from official.vision.evaluation import coco_utils
class COCOEvaluator(object):
"""COCO evaluation metric class."""
def __init__(self,
annotation_file,
include_mask,
need_rescale_bboxes=True,
per_category_metrics=False):
"""Constructs COCO evaluation class.
The class provides the interface to COCO metrics_fn. The
_update_op() takes detections from each image and pushes them to
self.detections. The _evaluate() loads a JSON file in COCO annotation format
as the groundtruths and runs COCO evaluation.
Args:
annotation_file: a JSON file that stores annotations of the eval dataset.
If `annotation_file` is None, groundtruth annotations will be loaded
from the dataloader.
include_mask: a boolean to indicate whether or not to include the mask
eval.
need_rescale_bboxes: If true, bboxes in `predictions` will be rescaled back
to absolute values (`image_info` is needed in this case).
per_category_metrics: Whether to return per category metrics.
"""
if annotation_file:
if annotation_file.startswith('gs://'):
_, local_val_json = tempfile.mkstemp(suffix='.json')
tf.io.gfile.remove(local_val_json)
tf.io.gfile.copy(annotation_file, local_val_json)
atexit.register(tf.io.gfile.remove, local_val_json)
else:
local_val_json = annotation_file
self._coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if include_mask else 'box'),
annotation_file=local_val_json)
self._annotation_file = annotation_file
self._include_mask = include_mask
self._per_category_metrics = per_category_metrics
self._metric_names = [
'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10',
'ARmax100', 'ARs', 'ARm', 'ARl'
]
self._required_prediction_fields = [
'source_id', 'num_detections', 'detection_classes', 'detection_scores',
'detection_boxes'
]
self._need_rescale_bboxes = need_rescale_bboxes
if self._need_rescale_bboxes:
self._required_prediction_fields.append('image_info')
self._required_groundtruth_fields = [
'source_id', 'height', 'width', 'classes', 'boxes'
]
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self._metric_names]
self._metric_names.extend(mask_metric_names)
self._required_prediction_fields.extend(['detection_masks'])
self._required_groundtruth_fields.extend(['masks'])
self.reset_states()
@property
def name(self):
return 'coco_metric'
def reset_states(self):
"""Resets internal states for a fresh run."""
self._predictions = {}
if not self._annotation_file:
self._groundtruths = {}
def result(self):
"""Evaluates detection results and resets states."""
metric_dict = self.evaluate()
# Cleans up the internal variables in order for a fresh eval next time.
self.reset_states()
return metric_dict
def evaluate(self):
"""Evaluates with detections from all images with COCO API.
Returns:
metrics_dict: a dictionary mapping each metric name to a float value,
covering box metrics and, if `include_mask` is True, mask metrics.
"""
if not self._annotation_file:
logging.info('There is no annotation_file in COCOEvaluator.')
gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset(
self._groundtruths)
coco_gt = coco_utils.COCOWrapper(
eval_type=('mask' if self._include_mask else 'box'),
gt_dataset=gt_dataset)
else:
logging.info('Using annotation file: %s', self._annotation_file)
coco_gt = self._coco_gt
coco_predictions = coco_utils.convert_predictions_to_coco_annotations(
self._predictions)
coco_dt = coco_gt.loadRes(predictions=coco_predictions)
image_ids = [ann['image_id'] for ann in coco_predictions]
coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
metrics_dict = {}
for i, name in enumerate(self._metric_names):
metrics_dict[name] = metrics[i].astype(np.float32)
# Adds metrics per category.
if self._per_category_metrics:
metrics_dict.update(self._retrieve_per_category_metrics(coco_eval))
if self._include_mask:
metrics_dict.update(self._retrieve_per_category_metrics(
mcoco_eval, prefix='mask'))
return metrics_dict
def _retrieve_per_category_metrics(self, coco_eval, prefix=''):
"""Retrieves per-category metrics and returns them in a dict.
Args:
coco_eval: a cocoeval.COCOeval object containing evaluation data.
prefix: str, a string used to prefix metric names.
Returns:
metrics_dict: A dictionary with per category metrics.
"""
metrics_dict = {}
if prefix:
prefix = prefix + ' '
if hasattr(coco_eval, 'category_stats'):
for category_index, category_id in enumerate(coco_eval.params.catIds):
if self._annotation_file:
coco_category = self._coco_gt.cats[category_id]
# if 'name' is available use it, otherwise use `id`
category_display_name = coco_category.get('name', category_id)
else:
category_display_name = category_id
metrics_dict[prefix + 'Precision mAP ByCategory/{}'.format(
category_display_name
)] = coco_eval.category_stats[0][category_index].astype(np.float32)
metrics_dict[prefix + 'Precision mAP ByCategory@50IoU/{}'.format(
category_display_name
)] = coco_eval.category_stats[1][category_index].astype(np.float32)
metrics_dict[prefix + 'Precision mAP ByCategory@75IoU/{}'.format(
category_display_name
)] = coco_eval.category_stats[2][category_index].astype(np.float32)
metrics_dict[prefix + 'Precision mAP ByCategory (small) /{}'.format(
category_display_name
)] = coco_eval.category_stats[3][category_index].astype(np.float32)
metrics_dict[prefix + 'Precision mAP ByCategory (medium) /{}'.format(
category_display_name
)] = coco_eval.category_stats[4][category_index].astype(np.float32)
metrics_dict[prefix + 'Precision mAP ByCategory (large) /{}'.format(
category_display_name
)] = coco_eval.category_stats[5][category_index].astype(np.float32)
metrics_dict[prefix + 'Recall AR@1 ByCategory/{}'.format(
category_display_name
)] = coco_eval.category_stats[6][category_index].astype(np.float32)
metrics_dict[prefix + 'Recall AR@10 ByCategory/{}'.format(
category_display_name
)] = coco_eval.category_stats[7][category_index].astype(np.float32)
metrics_dict[prefix + 'Recall AR@100 ByCategory/{}'.format(
category_display_name
)] = coco_eval.category_stats[8][category_index].astype(np.float32)
metrics_dict[prefix + 'Recall AR (small) ByCategory/{}'.format(
category_display_name
)] = coco_eval.category_stats[9][category_index].astype(np.float32)
metrics_dict[prefix + 'Recall AR (medium) ByCategory/{}'.format(
category_display_name
)] = coco_eval.category_stats[10][category_index].astype(np.float32)
metrics_dict[prefix + 'Recall AR (large) ByCategory/{}'.format(
category_display_name
)] = coco_eval.category_stats[11][category_index].astype(np.float32)
return metrics_dict
def _process_predictions(self, predictions):
image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
predictions['detection_boxes'] = (
predictions['detection_boxes'].astype(np.float32))
predictions['detection_boxes'] /= image_scale
if 'detection_outer_boxes' in predictions:
predictions['detection_outer_boxes'] = (
predictions['detection_outer_boxes'].astype(np.float32))
predictions['detection_outer_boxes'] /= image_scale
def _convert_to_numpy(self, groundtruths, predictions):
"""Converts tensors to numpy arrays."""
if groundtruths:
labels = tf.nest.map_structure(lambda x: x.numpy(), groundtruths)
numpy_groundtruths = {}
for key, val in labels.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_groundtruths[key] = val
else:
numpy_groundtruths = groundtruths
if predictions:
outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions)
numpy_predictions = {}
for key, val in outputs.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_predictions[key] = val
else:
numpy_predictions = predictions
return numpy_groundtruths, numpy_predictions
def update_state(self, groundtruths, predictions):
"""Update and aggregate detection results and groundtruth data.
Args:
groundtruths: a dictionary of Tensors including the fields below.
See also different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- height: a numpy array of int of shape [batch_size].
- width: a numpy array of int of shape [batch_size].
- num_detections: a numpy array of int of shape [batch_size].
- boxes: a numpy array of float of shape [batch_size, K, 4].
- classes: a numpy array of int of shape [batch_size, K].
Optional fields:
- is_crowds: a numpy array of int of shape [batch_size, K]. If the
field is absent, it is assumed that this instance is not crowd.
- areas: a numpy array of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a numpy array of float of shape
[batch_size, K, mask_height, mask_width],
predictions: a dictionary of tensors including the fields below.
See different parsers under `../dataloader` for more details.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- image_info [if `need_rescale_bboxes` is True]: a numpy array of
float of shape [batch_size, 4, 2].
- num_detections: a numpy array of
int of shape [batch_size].
- detection_boxes: a numpy array of float of shape [batch_size, K, 4].
- detection_classes: a numpy array of int of shape [batch_size, K].
- detection_scores: a numpy array of float of shape [batch_size, K].
Optional fields:
- detection_masks: a numpy array of float of shape
[batch_size, K, mask_height, mask_width].
Raises:
ValueError: if the required prediction or groundtruth fields are not
present in the incoming `predictions` or `groundtruths`.
"""
groundtruths, predictions = self._convert_to_numpy(groundtruths,
predictions)
for k in self._required_prediction_fields:
if k not in predictions:
raise ValueError(
'Missing the required key `{}` in predictions!'.format(k))
if self._need_rescale_bboxes:
self._process_predictions(predictions)
for k, v in six.iteritems(predictions):
if k not in self._predictions:
self._predictions[k] = [v]
else:
self._predictions[k].append(v)
if not self._annotation_file:
assert groundtruths
for k in self._required_groundtruth_fields:
if k not in groundtruths:
raise ValueError(
'Missing the required key `{}` in groundtruths!'.format(k))
for k, v in six.iteritems(groundtruths):
if k not in self._groundtruths:
self._groundtruths[k] = [v]
else:
self._groundtruths[k].append(v)
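

A minimal end-to-end sketch of the interface above (illustrative only, not part of the module; all names and values are placeholders). It assumes TF2 eager execution and this module's imports, with groundtruths supplied from the inputs rather than from an annotation file; pycocotools prints its standard summary when `result()` runs:

evaluator = COCOEvaluator(
    annotation_file=None, include_mask=False, need_rescale_bboxes=False)
example_groundtruths = {
    'source_id': tf.constant([1], tf.int64),
    'height': tf.constant([100], tf.int64),
    'width': tf.constant([100], tf.int64),
    'num_detections': tf.constant([1], tf.int64),
    'boxes': tf.constant([[[10., 20., 50., 60.]]]),  # [y1, x1, y2, x2]
    'classes': tf.constant([[3]], tf.int64),
}
example_predictions = {
    'source_id': tf.constant([1], tf.int64),
    'num_detections': tf.constant([1], tf.int64),
    'detection_boxes': tf.constant([[[10., 20., 50., 60.]]]),
    'detection_classes': tf.constant([[3]], tf.int64),
    'detection_scores': tf.constant([[0.9]]),
}
evaluator.update_state(example_groundtruths, example_predictions)
metrics = evaluator.result()  # e.g. metrics['AP'] is ~1.0 for this exact match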
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Util functions related to pycocotools and COCO eval."""
import copy
import json
# Import libraries
from absl import logging
import numpy as np
from PIL import Image
from pycocotools import coco
from pycocotools import mask as mask_api
import six
import tensorflow as tf
from official.common import dataset_fn
from official.vision.dataloaders import tf_example_decoder
from official.vision.ops import box_ops
from official.vision.ops import mask_ops
class COCOWrapper(coco.COCO):
"""COCO wrapper class.
This class wraps the COCO API object and provides the following additional
functionalities:
1. Support string type image id.
2. Support loading the groundtruth dataset using the external annotation
dictionary.
3. Support loading the prediction results using the external annotation
dictionary.
"""
def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None):
"""Instantiates a COCO-style API object.
Args:
eval_type: either 'box' or 'mask'.
annotation_file: a JSON file that stores annotations of the eval dataset.
This is required if `gt_dataset` is not provided.
gt_dataset: the groundtruth eval dataset in COCO API format.
"""
if ((annotation_file and gt_dataset) or
((not annotation_file) and (not gt_dataset))):
raise ValueError('One and only one of `annotation_file` and `gt_dataset` '
'needs to be specified.')
if eval_type not in ['box', 'mask']:
raise ValueError('The `eval_type` can only be either `box` or `mask`.')
coco.COCO.__init__(self, annotation_file=annotation_file)
self._eval_type = eval_type
if gt_dataset:
self.dataset = gt_dataset
self.createIndex()
def loadRes(self, predictions):
"""Loads result file and returns a result API object.
Args:
predictions: a list of dictionary each representing an annotation in COCO
format. The required fields are `image_id`, `category_id`, `score`,
`bbox`, `segmentation`.
Returns:
res: result COCO api object.
Raises:
ValueError: if the set of image ids from predictions is not a subset of
the set of image ids in the groundtruth dataset.
"""
res = coco.COCO()
res.dataset['images'] = copy.deepcopy(self.dataset['images'])
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
image_ids = [ann['image_id'] for ann in predictions]
if set(image_ids) != (set(image_ids) & set(self.getImgIds())):
raise ValueError('Results do not correspond to the current dataset!')
for ann in predictions:
x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2],
ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]]
if self._eval_type == 'box':
ann['area'] = ann['bbox'][2] * ann['bbox'][3]
ann['segmentation'] = [
[x1, y1, x1, y2, x2, y2, x2, y1]]
elif self._eval_type == 'mask':
ann['area'] = mask_api.area(ann['segmentation'])
res.dataset['annotations'] = copy.deepcopy(predictions)
res.createIndex()
return res
def convert_predictions_to_coco_annotations(predictions):
"""Converts a batch of predictions to annotations in COCO format.
Args:
predictions: a dictionary of lists of numpy arrays including the following
fields. K below denotes the maximum number of instances per image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- detection_boxes: a list of numpy arrays of float of shape
[batch_size, K, 4], where coordinates are in the original image
space (not the scaled image space).
- detection_classes: a list of numpy arrays of int of shape
[batch_size, K].
- detection_scores: a list of numpy arrays of float of shape
[batch_size, K].
Optional fields:
- detection_masks: a list of numpy arrays of float of shape
[batch_size, K, mask_height, mask_width].
Returns:
coco_predictions: prediction in COCO annotation format.
"""
coco_predictions = []
num_batches = len(predictions['source_id'])
max_num_detections = predictions['detection_classes'][0].shape[1]
use_outer_box = 'detection_outer_boxes' in predictions
for i in range(num_batches):
predictions['detection_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_boxes'][i])
if use_outer_box:
predictions['detection_outer_boxes'][i] = box_ops.yxyx_to_xywh(
predictions['detection_outer_boxes'][i])
mask_boxes = predictions['detection_outer_boxes']
else:
mask_boxes = predictions['detection_boxes']
batch_size = predictions['source_id'][i].shape[0]
for j in range(batch_size):
if 'detection_masks' in predictions:
image_masks = mask_ops.paste_instance_masks(
predictions['detection_masks'][i][j],
mask_boxes[i][j],
int(predictions['image_info'][i][j, 0, 0]),
int(predictions['image_info'][i][j, 0, 1]))
binary_masks = (image_masks > 0.0).astype(np.uint8)
encoded_masks = [
mask_api.encode(np.asfortranarray(binary_mask))
for binary_mask in list(binary_masks)]
for k in range(max_num_detections):
ann = {}
ann['image_id'] = predictions['source_id'][i][j]
ann['category_id'] = predictions['detection_classes'][i][j, k]
ann['bbox'] = predictions['detection_boxes'][i][j, k]
ann['score'] = predictions['detection_scores'][i][j, k]
if 'detection_masks' in predictions:
ann['segmentation'] = encoded_masks[k]
coco_predictions.append(ann)
for i, ann in enumerate(coco_predictions):
ann['id'] = i + 1
return coco_predictions
def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
"""Converts groundtruths to the dataset in COCO format.
Args:
groundtruths: a dictionary of lists of numpy arrays including the fields
below. Note that each element in a list corresponds to one batch of
examples. K below denotes the actual number of instances for each image.
Required fields:
- source_id: a list of numpy arrays of int or string of shape
[batch_size].
- height: a list of numpy arrays of int of shape [batch_size].
- width: a list of numpy arrays of int of shape [batch_size].
- num_detections: a list of numpy arrays of int of shape [batch_size].
- boxes: a list of numpy arrays of float of shape [batch_size, K, 4],
where coordinates are in the original image space (not the
normalized coordinates).
- classes: a list of numpy arrays of int of shape [batch_size, K].
Optional fields:
- is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If
the field is absent, it is assumed that this instance is not crowd.
- areas: a list of numpy arrays of float of shape [batch_size, K]. If the
field is absent, the area is calculated using either boxes or
masks depending on which one is available.
- masks: a list of numpy arrays of string of shape [batch_size, K],
label_map: (optional) a dictionary that defines items from the category id
to the category name. If `None`, the category mapping is collected from the
`groundtruths`.
Returns:
coco_groundtruths: the groundtruth dataset in COCO format.
"""
source_ids = np.concatenate(groundtruths['source_id'], axis=0)
heights = np.concatenate(groundtruths['height'], axis=0)
widths = np.concatenate(groundtruths['width'], axis=0)
gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)} for i, h, w
in zip(source_ids, heights, widths)]
gt_annotations = []
num_batches = len(groundtruths['source_id'])
for i in range(num_batches):
logging.info(
'convert_groundtruths_to_coco_dataset: Processing annotation %d', i)
max_num_instances = groundtruths['classes'][i].shape[1]
batch_size = groundtruths['source_id'][i].shape[0]
for j in range(batch_size):
num_instances = groundtruths['num_detections'][i][j]
if num_instances > max_num_instances:
logging.warning(
'num_groundtruths is larger than max_num_instances, %d vs. %d',
num_instances, max_num_instances)
num_instances = max_num_instances
for k in range(int(num_instances)):
ann = {}
ann['image_id'] = int(groundtruths['source_id'][i][j])
if 'is_crowds' in groundtruths:
ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k])
else:
ann['iscrowd'] = 0
ann['category_id'] = int(groundtruths['classes'][i][j, k])
boxes = groundtruths['boxes'][i]
ann['bbox'] = [
float(boxes[j, k, 1]),
float(boxes[j, k, 0]),
float(boxes[j, k, 3] - boxes[j, k, 1]),
float(boxes[j, k, 2] - boxes[j, k, 0])]
if 'areas' in groundtruths:
ann['area'] = float(groundtruths['areas'][i][j, k])
else:
ann['area'] = float(
(boxes[j, k, 3] - boxes[j, k, 1]) *
(boxes[j, k, 2] - boxes[j, k, 0]))
if 'masks' in groundtruths:
if isinstance(groundtruths['masks'][i][j, k], tf.Tensor):
mask = Image.open(
six.BytesIO(groundtruths['masks'][i][j, k].numpy()))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height,
width).astype(np.uint8))
else:
mask = Image.open(
six.BytesIO(groundtruths['masks'][i][j, k]))
width, height = mask.size
np_mask = (
np.array(mask.getdata()).reshape(height,
width).astype(np.uint8))
np_mask[np_mask > 0] = 255
encoded_mask = mask_api.encode(np.asfortranarray(np_mask))
ann['segmentation'] = encoded_mask
# Ensure the content of `counts` is a JSON-serializable string.
if 'counts' in ann['segmentation']:
ann['segmentation']['counts'] = six.ensure_str(
ann['segmentation']['counts'])
if 'areas' not in groundtruths:
ann['area'] = mask_api.area(encoded_mask)
gt_annotations.append(ann)
for i, ann in enumerate(gt_annotations):
ann['id'] = i + 1
if label_map:
gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map]
else:
category_ids = [gt['category_id'] for gt in gt_annotations]
gt_categories = [{'id': i} for i in set(category_ids)]
gt_dataset = {
'images': gt_images,
'categories': gt_categories,
'annotations': copy.deepcopy(gt_annotations),
}
return gt_dataset
class COCOGroundtruthGenerator:
"""Generates the groundtruth annotations from a single example."""
def __init__(self, file_pattern, file_type, num_examples, include_mask,
regenerate_source_id=False):
self._file_pattern = file_pattern
self._num_examples = num_examples
self._include_mask = include_mask
self._dataset_fn = dataset_fn.pick_dataset_fn(file_type)
self._regenerate_source_id = regenerate_source_id
def _parse_single_example(self, example):
"""Parses a single serialized tf.Example proto.
Args:
example: a serialized tf.Example proto string.
Returns:
A dictionary of groundtruth with the following fields:
source_id: a scalar tensor of int64 representing the image source_id.
height: a scalar tensor of int64 representing the image height.
width: a scalar tensor of int64 representing the image width.
boxes: a float tensor of shape [K, 4], representing the groundtruth
boxes in absolute coordinates with respect to the original image size.
classes: an int64 tensor of shape [K], representing the class labels of
each instance.
is_crowds: a bool tensor of shape [K], indicating whether the instance
is crowd.
areas: a float tensor of shape [K], indicating the area of each
instance.
masks: a string tensor of shape [K], containing the bytes of the png
mask of each instance.
"""
decoder = tf_example_decoder.TfExampleDecoder(
include_mask=self._include_mask,
regenerate_source_id=self._regenerate_source_id)
decoded_tensors = decoder.decode(example)
image = decoded_tensors['image']
image_size = tf.shape(image)[0:2]
boxes = box_ops.denormalize_boxes(
decoded_tensors['groundtruth_boxes'], image_size)
source_id = decoded_tensors['source_id']
if source_id.dtype is tf.string:
source_id = tf.strings.to_number(source_id, out_type=tf.int64)
groundtruths = {
'source_id': source_id,
'height': decoded_tensors['height'],
'width': decoded_tensors['width'],
'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0],
'boxes': boxes,
'classes': decoded_tensors['groundtruth_classes'],
'is_crowds': decoded_tensors['groundtruth_is_crowd'],
'areas': decoded_tensors['groundtruth_area'],
}
if self._include_mask:
groundtruths.update({
'masks': decoded_tensors['groundtruth_instance_masks_png'],
})
return groundtruths
def _build_pipeline(self):
"""Builds data pipeline to generate groundtruth annotations."""
dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
dataset = dataset.interleave(
map_func=lambda filename: self._dataset_fn(filename).prefetch(1),
cycle_length=None,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.take(self._num_examples)
dataset = dataset.map(self._parse_single_example,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(1, drop_remainder=False)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
def __call__(self):
return self._build_pipeline()
def scan_and_generator_annotation_file(file_pattern: str,
file_type: str,
num_samples: int,
include_mask: bool,
annotation_file: str,
regenerate_source_id: bool = False):
"""Scans and generates the COCO-style annotation JSON file given a dataset."""
groundtruth_generator = COCOGroundtruthGenerator(
file_pattern, file_type, num_samples, include_mask, regenerate_source_id)
generate_annotation_file(groundtruth_generator, annotation_file)
def generate_annotation_file(groundtruth_generator,
annotation_file):
"""Generates COCO-style annotation JSON file given a groundtruth generator."""
groundtruths = {}
logging.info('Loading groundtruth annotations from dataset to memory...')
for i, groundtruth in enumerate(groundtruth_generator()):
logging.info('generate_annotation_file: Processing annotation %d', i)
for k, v in six.iteritems(groundtruth):
if k not in groundtruths:
groundtruths[k] = [v]
else:
groundtruths[k].append(v)
gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths)
logging.info('Saving groundtruth annotations to the JSON file...')
with tf.io.gfile.GFile(annotation_file, 'w') as f:
f.write(json.dumps(gt_dataset))
logging.info('Done saving the JSON file...')
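

A small illustrative sketch (not part of the module) of the input layout `convert_groundtruths_to_coco_dataset` expects: each field is a list with one numpy array per batch. Here a single batch holds a single image with one box; all numbers are placeholders:

example_groundtruths = {
    'source_id': [np.array([1])],
    'height': [np.array([100])],
    'width': [np.array([100])],
    'num_detections': [np.array([1])],
    'boxes': [np.array([[[10., 20., 50., 60.]]])],  # [y1, x1, y2, x2]
    'classes': [np.array([[3]])],
}
gt_dataset = convert_groundtruths_to_coco_dataset(example_groundtruths)
# gt_dataset['annotations'][0]['bbox'] is [20.0, 10.0, 40.0, 40.0] (COCO xywh).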
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for coco_utils."""
import os
import tensorflow as tf
from official.vision.dataloaders import tfexample_utils
from official.vision.evaluation import coco_utils
class CocoUtilsTest(tf.test.TestCase):
def test_scan_and_generator_annotation_file(self):
num_samples = 10
example = tfexample_utils.create_detection_test_example(
image_height=512, image_width=512, image_channel=3, num_instances=10)
tf_examples = [example] * num_samples
data_file = os.path.join(self.create_tempdir(), 'test.tfrecord')
tfexample_utils.dump_to_tfrecord(
record_file=data_file, tf_examples=tf_examples)
annotation_file = os.path.join(self.create_tempdir(), 'annotation.json')
coco_utils.scan_and_generator_annotation_file(
file_pattern=data_file,
file_type='tfrecord',
num_samples=num_samples,
include_mask=True,
annotation_file=annotation_file)
self.assertTrue(
tf.io.gfile.exists(annotation_file),
msg=f'Annotation file {annotation_file} does not exist.')
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""IOU Metrics used for semantic segmentation models."""
import numpy as np
import tensorflow as tf
class PerClassIoU(tf.keras.metrics.Metric):
"""Computes the per-class Intersection-Over-Union metric.
Mean Intersection-Over-Union is a common evaluation metric for semantic image
segmentation, which first computes the IOU for each semantic class.
IOU is defined as follows:
IOU = true_positive / (true_positive + false_positive + false_negative).
The predictions are accumulated in a confusion matrix, weighted by
`sample_weight` and the metric is then calculated from it.
If `sample_weight` is `None`, weights default to 1.
Use `sample_weight` of 0 to mask values.
Example:
>>> # cm = [[1, 1],
>>> # [1, 1]]
>>> # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
>>> # iou = true_positives / (sum_row + sum_col - true_positives))
>>> # result = [1 / (2 + 2 - 1), 1 / (2 + 2 - 1)] = [0.33, 0.33]
>>> m = PerClassIoU(num_classes=2)
>>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1])
>>> m.result().numpy()
[0.33333334, 0.33333334]
"""
def __init__(self, num_classes, name=None, dtype=None):
"""Initializes `PerClassIoU`.
Args:
num_classes: The possible number of labels the prediction task can have.
This value must be provided, since a confusion matrix of dimension =
[num_classes, num_classes] will be allocated.
name: (Optional) string name of the metric instance.
dtype: (Optional) data type of the metric result.
"""
super(PerClassIoU, self).__init__(name=name, dtype=dtype)
self.num_classes = num_classes
# Variable to accumulate the predictions in the confusion matrix.
self.total_cm = self.add_weight(
'total_confusion_matrix',
shape=(num_classes, num_classes),
initializer=tf.compat.v1.zeros_initializer)
def update_state(self, y_true, y_pred, sample_weight=None):
"""Accumulates the confusion matrix statistics.
Args:
y_true: The ground truth values.
y_pred: The predicted values.
sample_weight: Optional weighting of each example. Defaults to 1. Can be a
`Tensor` whose rank is either 0, or the same rank as `y_true`, and must
be broadcastable to `y_true`.
Returns:
The update op that accumulates the confusion matrix statistics.
"""
y_true = tf.cast(y_true, self._dtype)
y_pred = tf.cast(y_pred, self._dtype)
# Flatten the input if its rank > 1.
if y_pred.shape.ndims > 1:
y_pred = tf.reshape(y_pred, [-1])
if y_true.shape.ndims > 1:
y_true = tf.reshape(y_true, [-1])
if sample_weight is not None:
sample_weight = tf.cast(sample_weight, self._dtype)
if sample_weight.shape.ndims > 1:
sample_weight = tf.reshape(sample_weight, [-1])
# Accumulate the prediction to current confusion matrix.
current_cm = tf.math.confusion_matrix(
y_true,
y_pred,
self.num_classes,
weights=sample_weight,
dtype=self._dtype)
return self.total_cm.assign_add(current_cm)
def result(self):
"""Computes the per-class intersection-over-union via the confusion matrix."""
sum_over_row = tf.cast(
tf.reduce_sum(self.total_cm, axis=0), dtype=self._dtype)
sum_over_col = tf.cast(
tf.reduce_sum(self.total_cm, axis=1), dtype=self._dtype)
true_positives = tf.cast(
tf.linalg.tensor_diag_part(self.total_cm), dtype=self._dtype)
# sum_over_row + sum_over_col =
# 2 * true_positives + false_positives + false_negatives.
denominator = sum_over_row + sum_over_col - true_positives
return tf.math.divide_no_nan(true_positives, denominator)
def reset_states(self):
tf.keras.backend.set_value(
self.total_cm, np.zeros((self.num_classes, self.num_classes)))
def get_config(self):
config = {'num_classes': self.num_classes}
base_config = super(PerClassIoU, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
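

A quick usage sketch (illustrative, not part of the module; assumes TF2 eager execution), mirroring the class docstring example:

m = PerClassIoU(num_classes=2)
m.update_state([0, 0, 1, 1], [0, 1, 0, 1])
print(m.result().numpy())  # [0.33333334, 0.33333334], one IoU per class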
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for iou metric."""
import tensorflow as tf
from official.vision.evaluation import iou
class MeanIoUTest(tf.test.TestCase):
def test_config(self):
m_obj = iou.PerClassIoU(num_classes=2, name='per_class_iou')
self.assertEqual(m_obj.name, 'per_class_iou')
self.assertEqual(m_obj.num_classes, 2)
m_obj2 = iou.PerClassIoU.from_config(m_obj.get_config())
self.assertEqual(m_obj2.name, 'per_class_iou')
self.assertEqual(m_obj2.num_classes, 2)
def test_unweighted(self):
y_pred = [0, 1, 0, 1]
y_true = [0, 0, 1, 1]
m_obj = iou.PerClassIoU(num_classes=2)
result = m_obj(y_true, y_pred)
# cm = [[1, 1],
# [1, 1]]
# sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
# iou = true_positives / (sum_row + sum_col - true_positives))
expected_result = [1 / (2 + 2 - 1), 1 / (2 + 2 - 1)]
self.assertAllClose(expected_result, result, atol=1e-3)
def test_weighted(self):
y_pred = tf.constant([0, 1, 0, 1], dtype=tf.float32)
y_true = tf.constant([0, 0, 1, 1])
sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1])
m_obj = iou.PerClassIoU(num_classes=2)
result = m_obj(y_true, y_pred, sample_weight=sample_weight)
# cm = [[0.2, 0.3],
# [0.4, 0.1]]
# sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1]
# iou = true_positives / (sum_row + sum_col - true_positives))
expected_result = [0.2 / (0.6 + 0.5 - 0.2), 0.1 / (0.4 + 0.5 - 0.1)]
self.assertAllClose(expected_result, result, atol=1e-3)
def test_multi_dim_input(self):
y_pred = tf.constant([[0, 1], [0, 1]], dtype=tf.float32)
y_true = tf.constant([[0, 0], [1, 1]])
sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]])
m_obj = iou.PerClassIoU(num_classes=2)
result = m_obj(y_true, y_pred, sample_weight=sample_weight)
# cm = [[0.2, 0.3],
# [0.4, 0.1]]
# sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1]
# iou = true_positives / (sum_row + sum_col - true_positives))
expected_result = [0.2 / (0.6 + 0.5 - 0.2), 0.1 / (0.4 + 0.5 - 0.1)]
self.assertAllClose(expected_result, result, atol=1e-3)
def test_zero_valid_entries(self):
m_obj = iou.PerClassIoU(num_classes=2)
self.assertAllClose(m_obj.result(), [0, 0], atol=1e-3)
def test_zero_and_non_zero_entries(self):
y_pred = tf.constant([1], dtype=tf.float32)
y_true = tf.constant([1])
m_obj = iou.PerClassIoU(num_classes=2)
result = m_obj(y_true, y_pred)
# cm = [[0, 0],
# [0, 1]]
# sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1]
# iou = true_positives / (sum_row + sum_col - true_positives))
expected_result = [0, 1 / (1 + 1 - 1)]
self.assertAllClose(expected_result, result, atol=1e-3)
def test_update_state_and_result(self):
y_pred = [0, 1, 0, 1]
y_true = [0, 0, 1, 1]
m_obj = iou.PerClassIoU(num_classes=2)
m_obj.update_state(y_true, y_pred)
result = m_obj.result()
# cm = [[1, 1],
# [1, 1]]
# sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
# iou = true_positives / (sum_row + sum_col - true_positives))
expected_result = [1 / (2 + 2 - 1), 1 / (2 + 2 - 1)]
self.assertAllClose(expected_result, result, atol=1e-3)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of the Panoptic Quality metric.
Panoptic Quality is an instance-based metric for evaluating the task of
image parsing, aka panoptic segmentation.
Please see the paper for details:
"Panoptic Segmentation", Alexander Kirillov, Kaiming He, Ross Girshick,
Carsten Rother and Piotr Dollar. arXiv:1801.00868, 2018.
Note that this metric class is branched from
https://github.com/tensorflow/models/blob/master/research/deeplab/evaluation/panoptic_quality.py
"""
import collections
import numpy as np
_EPSILON = 1e-10
def realdiv_maybe_zero(x, y):
"""Element-wise x / y, returning 0 wherever y is (near) zero."""
return np.where(
np.less(np.abs(y), _EPSILON), np.zeros_like(x), np.divide(x, y))
def _ids_to_counts(id_array):
"""Given a numpy array, returns a mapping from each unique entry to its count."""
ids, counts = np.unique(id_array, return_counts=True)
return dict(zip(ids, counts))
class PanopticQuality:
"""Metric class for Panoptic Quality.
"Panoptic Segmentation" by Alexander Kirillov, Kaiming He, Ross Girshick,
Carsten Rother, Piotr Dollar.
https://arxiv.org/abs/1801.00868
"""
def __init__(self, num_categories, ignored_label, max_instances_per_category,
offset):
"""Initializes PanopticQuality.
Args:
num_categories: The number of segmentation categories (or "classes") in
the dataset.
ignored_label: A category id that is ignored in evaluation, e.g. the void
label as defined in COCO panoptic segmentation dataset.
max_instances_per_category: The maximum number of instances for each
category. Used in ensuring unique instance labels.
offset: The maximum number of unique labels. This is used, by multiplying
the ground-truth labels, to generate unique ids for individual regions
of overlap between groundtruth and predicted segments.
"""
self.num_categories = num_categories
self.ignored_label = ignored_label
self.max_instances_per_category = max_instances_per_category
self.offset = offset
self.reset()
def _naively_combine_labels(self, category_mask, instance_mask):
"""Naively creates a combined label array from categories and instances."""
return (category_mask.astype(np.uint32) * self.max_instances_per_category +
instance_mask.astype(np.uint32))
def compare_and_accumulate(self, groundtruths, predictions):
"""Compares predicted segmentation with groundtruth, accumulates its metric.
It is not assumed that instance ids are unique across different categories.
See for example combine_semantic_and_instance_predictions.py in official
PanopticAPI evaluation code for issues to consider when fusing category
and instance labels.
Instance ids of the ignored category have the meaning that id 0 is "void"
and the remaining ones are crowd instances.
Args:
groundtruths: A dictionary containing groundtruth labels. It should contain
the following fields.
- category_mask: A 2D numpy uint16 array of groundtruth per-pixel
category labels.
- instance_mask: A 2D numpy uint16 array of groundtruth instance labels.
predictions: A dictionary containing the model outputs. It should contain
the following fields.
- category_mask: A 2D numpy uint16 array of predicted per-pixel
category labels.
- instance_mask: A 2D numpy uint16 array of predicted instance labels.
"""
groundtruth_category_mask = groundtruths['category_mask']
groundtruth_instance_mask = groundtruths['instance_mask']
predicted_category_mask = predictions['category_mask']
predicted_instance_mask = predictions['instance_mask']
# First, combine the category and instance labels so that every unique
# value for (category, instance) is assigned a unique integer label.
pred_segment_id = self._naively_combine_labels(predicted_category_mask,
predicted_instance_mask)
gt_segment_id = self._naively_combine_labels(groundtruth_category_mask,
groundtruth_instance_mask)
# Pre-calculate areas for all groundtruth and predicted segments.
gt_segment_areas = _ids_to_counts(gt_segment_id)
pred_segment_areas = _ids_to_counts(pred_segment_id)
# We assume there is only one void segment and it has instance id = 0.
void_segment_id = self.ignored_label * self.max_instances_per_category
# There may be other ignored groundtruth segments with instance id > 0;
# find those ids using the unique segment ids extracted with the area
# computation above.
ignored_segment_ids = {
gt_segment_id for gt_segment_id in gt_segment_areas
if (gt_segment_id //
self.max_instances_per_category) == self.ignored_label
}
# Next, combine the groundtruth and predicted labels. Dividing up the pixels
# based on which groundtruth segment and which predicted segment they belong
# to, this will assign a different 32-bit integer label to each choice
# of (groundtruth segment, predicted segment), encoded as
# gt_segment_id * offset + pred_segment_id.
intersection_id_array = (
gt_segment_id.astype(np.uint64) * self.offset +
pred_segment_id.astype(np.uint64))
# For every combination of (groundtruth segment, predicted segment) with a
# non-empty intersection, this counts the number of pixels in that
# intersection.
intersection_areas = _ids_to_counts(intersection_id_array)
# Helper function that computes the area of the overlap between a predicted
# segment and the ground-truth void/ignored segment.
def prediction_void_overlap(pred_segment_id):
void_intersection_id = void_segment_id * self.offset + pred_segment_id
return intersection_areas.get(void_intersection_id, 0)
# Compute overall ignored overlap.
def prediction_ignored_overlap(pred_segment_id):
total_ignored_overlap = 0
for ignored_segment_id in ignored_segment_ids:
intersection_id = ignored_segment_id * self.offset + pred_segment_id
total_ignored_overlap += intersection_areas.get(intersection_id, 0)
return total_ignored_overlap
# Sets populated with the ids of groundtruth/predicted segments that have
# been matched to an overlapping predicted/groundtruth segment,
# respectively.
gt_matched = set()
pred_matched = set()
# Calculate IoU per pair of intersecting segments of the same category.
for intersection_id, intersection_area in intersection_areas.items():
gt_segment_id = int(intersection_id // self.offset)
pred_segment_id = int(intersection_id % self.offset)
gt_category = int(gt_segment_id // self.max_instances_per_category)
pred_category = int(pred_segment_id // self.max_instances_per_category)
if gt_category != pred_category:
continue
# Union between the groundtruth and predicted segments being compared does
# not include the portion of the predicted segment that consists of
# groundtruth "void" pixels.
union = (
gt_segment_areas[gt_segment_id] +
pred_segment_areas[pred_segment_id] - intersection_area -
prediction_void_overlap(pred_segment_id))
iou = intersection_area / union
if iou > 0.5:
self.tp_per_class[gt_category] += 1
self.iou_per_class[gt_category] += iou
gt_matched.add(gt_segment_id)
pred_matched.add(pred_segment_id)
# Count false negatives for each category.
for gt_segment_id in gt_segment_areas:
if gt_segment_id in gt_matched:
continue
category = gt_segment_id // self.max_instances_per_category
# Failing to detect a void segment is not a false negative.
if category == self.ignored_label:
continue
self.fn_per_class[category] += 1
# Count false positives for each category.
for pred_segment_id in pred_segment_areas:
if pred_segment_id in pred_matched:
continue
# A false positive is not penalized if it is mostly ignored in the
# groundtruth.
if (prediction_ignored_overlap(pred_segment_id) /
pred_segment_areas[pred_segment_id]) > 0.5:
continue
category = pred_segment_id // self.max_instances_per_category
self.fp_per_class[category] += 1
def _valid_categories(self):
"""Categories with a "valid" value for the metric, i.e. with > 0 instances.
We will ignore the `ignored_label` class and other classes which have
`tp + fn + fp = 0`.
Returns:
Boolean array of shape `[num_categories]`.
"""
valid_categories = np.not_equal(
self.tp_per_class + self.fn_per_class + self.fp_per_class, 0)
if self.ignored_label >= 0 and self.ignored_label < self.num_categories:
valid_categories[self.ignored_label] = False
return valid_categories
def result_per_category(self):
"""For supported metrics, returns individual per-category metric values.
Returns:
A dictionary containing all per-class metrics; each metric is a numpy array
of shape `[self.num_categories]`, where index `i` is the metric value
over only that category.
"""
sq_per_class = realdiv_maybe_zero(self.iou_per_class, self.tp_per_class)
rq_per_class = realdiv_maybe_zero(
self.tp_per_class,
self.tp_per_class + 0.5 * self.fn_per_class + 0.5 * self.fp_per_class)
return {
'sq_per_class': sq_per_class,
'rq_per_class': rq_per_class,
'pq_per_class': np.multiply(sq_per_class, rq_per_class)
}
def result(self, is_thing=None):
"""Computes and returns the detailed metric results over all comparisons.
Args:
is_thing: A boolean array of length `num_categories`. The entry
`is_thing[category_id]` is True iff that category is a "thing" category
instead of "stuff."
Returns:
A dictionary with a breakdown of metrics and/or metric factors by things,
stuff, and all categories.
"""
results = self.result_per_category()
valid_categories = self._valid_categories()
# If known, break down which categories are valid _and_ things/stuff.
category_sets = collections.OrderedDict()
category_sets['All'] = valid_categories
if is_thing is not None:
category_sets['Things'] = np.logical_and(valid_categories, is_thing)
category_sets['Stuff'] = np.logical_and(valid_categories,
np.logical_not(is_thing))
for category_set_name, in_category_set in category_sets.items():
if np.any(in_category_set):
results.update({
f'{category_set_name}_pq':
np.mean(results['pq_per_class'][in_category_set]),
f'{category_set_name}_sq':
np.mean(results['sq_per_class'][in_category_set]),
f'{category_set_name}_rq':
np.mean(results['rq_per_class'][in_category_set]),
# The number of categories in this subset.
f'{category_set_name}_num_categories':
np.sum(in_category_set.astype(np.int32)),
})
else:
results[category_set_name] = {
f'{category_set_name}_pq': 0.,
f'{category_set_name}_sq': 0.,
f'{category_set_name}_rq': 0.,
f'{category_set_name}_num_categories': 0
}
return results
def reset(self):
"""Resets the accumulation to the metric class's state at initialization."""
self.iou_per_class = np.zeros(self.num_categories, dtype=np.float64)
self.tp_per_class = np.zeros(self.num_categories, dtype=np.float64)
self.fn_per_class = np.zeros(self.num_categories, dtype=np.float64)
self.fp_per_class = np.zeros(self.num_categories, dtype=np.float64)
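

A tiny self-check sketch (illustrative, not part of the module; mask values and constructor arguments are placeholders): a perfect match on a 2x2 image with a single segment of category 0 yields PQ = SQ = RQ = 1 for that category.

pq = PanopticQuality(num_categories=2, ignored_label=1,
                     max_instances_per_category=4, offset=256)
masks = {
    'category_mask': np.zeros([2, 2], dtype=np.uint16),
    'instance_mask': np.ones([2, 2], dtype=np.uint16),
}
pq.compare_and_accumulate(masks, masks)
print(pq.result()['All_pq'])  # 1.0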
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The panoptic quality evaluator.

The following snippet demonstrates the use of interfaces:

  evaluator = PanopticQualityEvaluator(...)
  for _ in range(num_evals):
    for _ in range(num_batches_per_eval):
      groundtruths, predictions = predictor.predict(...)  # pop a batch.
      evaluator.update_state(groundtruths, predictions)
    evaluator.result()  # finish one full eval and reset states.

See also: https://github.com/cocodataset/cocoapi/
"""
import numpy as np
import tensorflow as tf
from official.vision.evaluation import panoptic_quality
def _crop_padding(mask, image_info):
"""Crops padded masks to match original image shape.
Args:
mask: a padded mask tensor.
image_info: a tensor that holds information about original and preprocessed
images.
Returns:
The cropped mask, with a leading batch dimension: tf.Tensor.
"""
image_shape = tf.cast(image_info[0, :], tf.int32)
mask = tf.image.crop_to_bounding_box(
tf.expand_dims(mask, axis=-1), 0, 0,
image_shape[0], image_shape[1])
return tf.expand_dims(mask[:, :, 0], axis=0)
class PanopticQualityEvaluator:
"""Panoptic Quality metric class."""
def __init__(self, num_categories, ignored_label, max_instances_per_category,
offset, is_thing=None, rescale_predictions=False):
"""Constructs Panoptic Quality evaluation class.
The class provides the interface to Panoptic Quality metrics_fn.
Args:
num_categories: The number of segmentation categories (or "classes") in
the dataset.
ignored_label: A category id that is ignored in evaluation, e.g. the void
label as defined in COCO panoptic segmentation dataset.
max_instances_per_category: The maximum number of instances for each
category. Used in ensuring unique instance labels.
offset: The maximum number of unique labels. This is used, by multiplying
the ground-truth labels, to generate unique ids for individual regions
of overlap between groundtruth and predicted segments.
is_thing: A boolean array of length `num_categories`. The entry
`is_thing[category_id]` is True iff that category is a "thing" category
instead of "stuff." Defaults to `None`, which means categories are not
split into these two groups.
rescale_predictions: `bool`, whether to scale predictions back to the
original image sizes. If True, groundtruths['image_info'] is used to rescale
predictions.
"""
self._pq_metric_module = panoptic_quality.PanopticQuality(
num_categories, ignored_label, max_instances_per_category, offset)
self._is_thing = is_thing
self._rescale_predictions = rescale_predictions
self._required_prediction_fields = ['category_mask', 'instance_mask']
self._required_groundtruth_fields = ['category_mask', 'instance_mask']
self.reset_states()
@property
def name(self):
return 'panoptic_quality'
def reset_states(self):
"""Resets internal states for a fresh run."""
self._pq_metric_module.reset()
def result(self):
"""Evaluates detection results and resets states."""
results = self._pq_metric_module.result(self._is_thing)
self.reset_states()
return results
def _convert_to_numpy(self, groundtruths, predictions):
"""Converts tensors to numpy arrays."""
if groundtruths:
labels = tf.nest.map_structure(lambda x: x.numpy(), groundtruths)
numpy_groundtruths = {}
for key, val in labels.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_groundtruths[key] = val
else:
numpy_groundtruths = groundtruths
if predictions:
outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions)
numpy_predictions = {}
for key, val in outputs.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_predictions[key] = val
else:
numpy_predictions = predictions
return numpy_groundtruths, numpy_predictions
def update_state(self, groundtruths, predictions):
"""Update and aggregate detection results and groundtruth data.
Args:
groundtruths: a dictionary of Tensors including the fields below. See also
different parsers under `../dataloader` for more details.
Required fields:
- category_mask: a numpy array of uint16 of shape [batch_size, H, W].
- instance_mask: a numpy array of uint16 of shape [batch_size, H, W].
- image_info: [batch, 4, 2], a tensor that holds information about
original and preprocessed images. Each entry is in the format of
[[original_height, original_width], [input_height, input_width],
[y_scale, x_scale], [y_offset, x_offset]], where [input_height,
input_width] is the actual scaled image size, and [y_scale, x_scale]
is the scaling factor, which is the ratio of scaled dimension /
original dimension.
predictions: a dictionary of tensors including the fields below. See
different parsers under `../dataloader` for more details.
Required fields:
- category_mask: a numpy array of uint16 of shape [batch_size, H, W].
- instance_mask: a numpy array of uint16 of shape [batch_size, H, W].
Raises:
ValueError: if the required prediction or groundtruth fields are not
present in the incoming `predictions` or `groundtruths`.
"""
groundtruths, predictions = self._convert_to_numpy(groundtruths,
predictions)
for k in self._required_prediction_fields:
if k not in predictions:
raise ValueError(
'Missing the required key `{}` in predictions!'.format(k))
for k in self._required_groundtruth_fields:
if k not in groundtruths:
raise ValueError(
'Missing the required key `{}` in groundtruths!'.format(k))
if self._rescale_predictions:
for idx in range(len(groundtruths['category_mask'])):
image_info = groundtruths['image_info'][idx]
groundtruths_ = {
'category_mask':
_crop_padding(groundtruths['category_mask'][idx], image_info),
'instance_mask':
_crop_padding(groundtruths['instance_mask'][idx], image_info),
}
predictions_ = {
'category_mask':
_crop_padding(predictions['category_mask'][idx], image_info),
'instance_mask':
_crop_padding(predictions['instance_mask'][idx], image_info),
}
groundtruths_, predictions_ = self._convert_to_numpy(
groundtruths_, predictions_)
self._pq_metric_module.compare_and_accumulate(
groundtruths_, predictions_)
else:
self._pq_metric_module.compare_and_accumulate(groundtruths, predictions)
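

A minimal usage sketch (illustrative, not part of the module; mask contents are placeholders, and `rescale_predictions` is left off so `image_info` is not required). It assumes TF2 eager execution and this module's imports:

evaluator = PanopticQualityEvaluator(
    num_categories=2, ignored_label=1, max_instances_per_category=4,
    offset=256, rescale_predictions=False)
masks = {
    'category_mask': tf.zeros([1, 2, 2], dtype=tf.uint16),
    'instance_mask': tf.ones([1, 2, 2], dtype=tf.uint16),
}
evaluator.update_state(masks, masks)
results = evaluator.result()  # e.g. results['All_pq'] is 1.0 for this match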
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic_quality_evaluator."""
import numpy as np
import tensorflow as tf
from official.vision.evaluation import panoptic_quality_evaluator
class PanopticQualityEvaluatorTest(tf.test.TestCase):
def test_multiple_batches(self):
category_mask = np.zeros([6, 6], np.uint16)
groundtruth_instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
good_det_instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruths = {
'category_mask':
tf.convert_to_tensor([category_mask]),
'instance_mask':
tf.convert_to_tensor([groundtruth_instance_mask]),
'image_info':
tf.convert_to_tensor([[[6, 6], [6, 6], [1.0, 1.0], [0, 0]]],
dtype=tf.float32)
}
predictions = {
'category_mask': tf.convert_to_tensor([category_mask]),
'instance_mask': tf.convert_to_tensor([good_det_instance_mask])
}
pq_evaluator = panoptic_quality_evaluator.PanopticQualityEvaluator(
num_categories=1,
ignored_label=2,
max_instances_per_category=16,
offset=16,
rescale_predictions=True)
for _ in range(2):
pq_evaluator.update_state(groundtruths, predictions)
bad_det_instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
predictions['instance_mask'] = tf.convert_to_tensor([bad_det_instance_mask])
for _ in range(2):
pq_evaluator.update_state(groundtruths, predictions)
results = pq_evaluator.result()
np.testing.assert_array_equal(results['pq_per_class'],
[((28 / 30 + 6 / 8) + (27 / 32)) / 2 / 2])
np.testing.assert_array_equal(results['rq_per_class'], [3 / 4])
np.testing.assert_array_equal(results['sq_per_class'],
[((28 / 30 + 6 / 8) + (27 / 32)) / 3])
self.assertAlmostEqual(results['All_pq'], 0.63177083)
self.assertAlmostEqual(results['All_rq'], 0.75)
self.assertAlmostEqual(results['All_sq'], 0.84236111)
self.assertEqual(results['All_num_categories'], 1)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Panoptic Quality metric.
Note that this metric test class is branched from
https://github.com/tensorflow/models/blob/master/research/deeplab/evaluation/panoptic_quality_test.py
"""
from absl.testing import absltest
import numpy as np
from official.vision.evaluation import panoptic_quality
class PanopticQualityTest(absltest.TestCase):
def test_perfect_match(self):
category_mask = np.zeros([6, 6], np.uint16)
instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 1, 1, 1],
[1, 2, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': instance_mask
}
predictions = {
'category_mask': category_mask,
'instance_mask': instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=1,
ignored_label=2,
max_instances_per_category=16,
offset=16)
pq_metric.compare_and_accumulate(groundtruths, predictions)
np.testing.assert_array_equal(pq_metric.iou_per_class, [2.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [2])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [1.0])
np.testing.assert_array_equal(results['rq_per_class'], [1.0])
np.testing.assert_array_equal(results['sq_per_class'], [1.0])
self.assertAlmostEqual(results['All_pq'], 1.0)
self.assertAlmostEqual(results['All_rq'], 1.0)
self.assertAlmostEqual(results['All_sq'], 1.0)
self.assertEqual(results['All_num_categories'], 1)
def test_totally_wrong(self):
category_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0],
[0, 1, 1, 1, 1, 0],
[0, 1, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
instance_mask = np.zeros([6, 6], np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': instance_mask
}
predictions = {
'category_mask': 1 - category_mask,
'instance_mask': instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=2,
ignored_label=2,
max_instances_per_category=1,
offset=16)
pq_metric.compare_and_accumulate(groundtruths, predictions)
np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 0.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 0])
np.testing.assert_array_equal(pq_metric.fn_per_class, [1, 1])
np.testing.assert_array_equal(pq_metric.fp_per_class, [1, 1])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [0.0, 0.0])
np.testing.assert_array_equal(results['rq_per_class'], [0.0, 0.0])
np.testing.assert_array_equal(results['sq_per_class'], [0.0, 0.0])
self.assertAlmostEqual(results['All_pq'], 0.0)
self.assertAlmostEqual(results['All_rq'], 0.0)
self.assertAlmostEqual(results['All_sq'], 0.0)
self.assertEqual(results['All_num_categories'], 2)
def test_matches_by_iou(self):
groundtruth_instance_mask = np.array(
[
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
good_det_instance_mask = np.array(
[
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruths = {
'category_mask': np.zeros_like(groundtruth_instance_mask),
'instance_mask': groundtruth_instance_mask
}
predictions = {
'category_mask': np.zeros_like(good_det_instance_mask),
'instance_mask': good_det_instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=1,
ignored_label=2,
max_instances_per_category=16,
offset=16)
pq_metric.compare_and_accumulate(groundtruths, predictions)
# iou(1, 1) = 28/30
# iou(2, 2) = 6 / 8
np.testing.assert_array_almost_equal(pq_metric.iou_per_class,
[28 / 30 + 6 / 8])
np.testing.assert_array_equal(pq_metric.tp_per_class, [2])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'],
[(28 / 30 + 6 / 8) / 2])
np.testing.assert_array_equal(results['rq_per_class'], [1.0])
np.testing.assert_array_equal(results['sq_per_class'],
[(28 / 30 + 6 / 8) / 2])
self.assertAlmostEqual(results['All_pq'], (28 / 30 + 6 / 8) / 2)
self.assertAlmostEqual(results['All_rq'], 1.0)
self.assertAlmostEqual(results['All_sq'], (28 / 30 + 6 / 8) / 2)
self.assertEqual(results['All_num_categories'], 1)
bad_det_instance_mask = np.array(
[
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
predictions['instance_mask'] = bad_det_instance_mask
pq_metric.reset()
pq_metric.compare_and_accumulate(groundtruths, predictions)
# iou(1, 1) = 27/32
np.testing.assert_array_almost_equal(pq_metric.iou_per_class, [27 / 32])
np.testing.assert_array_equal(pq_metric.tp_per_class, [1])
np.testing.assert_array_equal(pq_metric.fn_per_class, [1])
np.testing.assert_array_equal(pq_metric.fp_per_class, [1])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [27 / 32 / 2])
np.testing.assert_array_equal(results['rq_per_class'], [0.5])
np.testing.assert_array_equal(results['sq_per_class'], [27 / 32])
self.assertAlmostEqual(results['All_pq'], 27 / 32 / 2)
self.assertAlmostEqual(results['All_rq'], 0.5)
self.assertAlmostEqual(results['All_sq'], 27 / 32)
self.assertEqual(results['All_num_categories'], 1)
def test_wrong_instances(self):
category_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 1, 2, 2],
[1, 2, 2, 1, 2, 2],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruth_instance_mask = np.zeros([6, 6], dtype=np.uint16)
predicted_instance_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': groundtruth_instance_mask
}
predictions = {
'category_mask': category_mask,
'instance_mask': predicted_instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=3,
ignored_label=0,
max_instances_per_category=10,
offset=100)
pq_metric.compare_and_accumulate(groundtruths, predictions)
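    # The class-2 groundtruth forms a single 8-pixel segment, while the
    # prediction splits it into two 4-pixel instances. Each candidate match
    # has IoU 4/8 = 0.5, which does not exceed the 0.5 matching threshold,
    # so class 2 receives 1 FN and 2 FPs.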
np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 1.0, 0.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 1, 0])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0, 0, 1])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0, 0, 2])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [0.0, 1.0, 0.0])
np.testing.assert_array_equal(results['rq_per_class'], [0.0, 1.0, 0.0])
np.testing.assert_array_equal(results['sq_per_class'], [0.0, 1.0, 0.0])
self.assertAlmostEqual(results['All_pq'], 0.5)
self.assertAlmostEqual(results['All_rq'], 0.5)
self.assertAlmostEqual(results['All_sq'], 0.5)
self.assertEqual(results['All_num_categories'], 2)
def test_instance_order_is_arbitrary(self):
category_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 1, 2, 2],
[1, 2, 2, 1, 2, 2],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruth_instance_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 1, 1, 0, 0, 0],
[0, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
predicted_instance_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': groundtruth_instance_mask
}
predictions = {
'category_mask': category_mask,
'instance_mask': predicted_instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=3,
ignored_label=0,
max_instances_per_category=10,
offset=100)
pq_metric.compare_and_accumulate(groundtruths, predictions)
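    # Matching is by segment overlap, not by instance id: each class-2 region
    # matches perfectly even though groundtruth and prediction use opposite
    # instance ids, giving 2 TPs with IoU 1.0 each.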
np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 1.0, 2.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 1, 2])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0, 0, 0])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0, 0, 0])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [0.0, 1.0, 1.0])
np.testing.assert_array_equal(results['rq_per_class'], [0.0, 1.0, 1.0])
np.testing.assert_array_equal(results['sq_per_class'], [0.0, 1.0, 1.0])
self.assertAlmostEqual(results['All_pq'], 1.0)
self.assertAlmostEqual(results['All_rq'], 1.0)
self.assertAlmostEqual(results['All_sq'], 1.0)
self.assertEqual(results['All_num_categories'], 2)
if __name__ == '__main__':
absltest.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Metrics for segmentation."""
import tensorflow as tf
from official.vision.evaluation import iou
class MeanIoU(tf.keras.metrics.MeanIoU):
"""Mean IoU metric for semantic segmentation.
  This class uses tf.keras.metrics.MeanIoU to compute a batched mean IoU when
  the predictions and groundtruth masks have been resized to the same size
  (rescale_predictions=False). It can also compute the mean IoU at the
  original groundtruth sizes (rescale_predictions=True), in which case each
  prediction is rescaled back to the original image size.
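  Example usage (a sketch; the tensor contents below are illustrative
  assumptions):

    metric = MeanIoU(num_classes=2, rescale_predictions=False)
    y_true = {
        'masks': ...,        # [batch, height, width, 1] groundtruth labels.
        'valid_masks': ...,  # [batch, height, width, 1] boolean validity.
        'image_info': ...,   # [batch, 4, 2] resize metadata.
    }
    metric.update_state(y_true, y_pred)  # y_pred: [batch, h, w, num_classes].
    miou = metric.result()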
"""
def __init__(
self, num_classes, rescale_predictions=False, name=None, dtype=None):
"""Constructs Segmentation evaluator class.
Args:
num_classes: `int`, number of classes.
rescale_predictions: `bool`, whether to scale back prediction to original
image sizes. If True, y_true['image_info'] is used to rescale
predictions.
      name: `str`, name of the metric instance.
dtype: data type of the metric result.
"""
self._rescale_predictions = rescale_predictions
super().__init__(num_classes=num_classes, name=name, dtype=dtype)
def update_state(self, y_true, y_pred):
"""Updates metric state.
Args:
      y_true: `dict`, dictionary with the following keys:
        - masks: [batch, height, width, 1], groundtruth masks.
        - valid_masks: [batch, height, width, 1], valid elements in the mask.
        - image_info: [batch, 4, 2], a tensor that holds information about
          the original and preprocessed images. Each entry is in the format of
          [[original_height, original_width], [input_height, input_width],
          [y_scale, x_scale], [y_offset, x_offset]], where
          [input_height, input_width] is the actual scaled image size, and
          [y_scale, x_scale] is the scaling factor, which is the ratio of
          scaled dimension / original dimension.
      y_pred: Tensor [batch, height_p, width_p, num_classes], predicted masks.
"""
predictions = y_pred
masks = y_true['masks']
valid_masks = y_true['valid_masks']
images_info = y_true['image_info']
    if isinstance(predictions, (tuple, list)):
predictions = tf.concat(predictions, axis=0)
masks = tf.concat(masks, axis=0)
valid_masks = tf.concat(valid_masks, axis=0)
images_info = tf.concat(images_info, axis=0)
    # Ignored mask elements are set to zero for the argmax op.
masks = tf.where(valid_masks, masks, tf.zeros_like(masks))
if self._rescale_predictions:
# This part can only run on cpu/gpu due to dynamic image resizing.
for i in range(tf.shape(predictions)[0]):
mask = masks[i]
valid_mask = valid_masks[i]
predicted_mask = predictions[i]
image_info = images_info[i]
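        # image_info rows: [0] original size, [1] scaled input size,
        # [2] y/x scale factors, [3] y/x offsets. First undo the scaling,
        # then crop away the offsets and padding to recover the original
        # image extent.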
rescale_size = tf.cast(
tf.math.ceil(image_info[1, :] / image_info[2, :]), tf.int32)
image_shape = tf.cast(image_info[0, :], tf.int32)
offsets = tf.cast(image_info[3, :], tf.int32)
predicted_mask = tf.image.resize(
predicted_mask,
rescale_size,
method=tf.image.ResizeMethod.BILINEAR)
predicted_mask = tf.image.crop_to_bounding_box(predicted_mask,
offsets[0], offsets[1],
image_shape[0],
image_shape[1])
mask = tf.image.crop_to_bounding_box(mask, 0, 0, image_shape[0],
image_shape[1])
valid_mask = tf.image.crop_to_bounding_box(valid_mask, 0, 0,
image_shape[0],
image_shape[1])
predicted_mask = tf.argmax(predicted_mask, axis=2)
flatten_predictions = tf.reshape(predicted_mask, shape=[1, -1])
flatten_masks = tf.reshape(mask, shape=[1, -1])
flatten_valid_masks = tf.reshape(valid_mask, shape=[1, -1])
super(MeanIoU, self).update_state(
flatten_masks, flatten_predictions,
tf.cast(flatten_valid_masks, tf.float32))
else:
predictions = tf.image.resize(
predictions,
tf.shape(masks)[1:3],
method=tf.image.ResizeMethod.BILINEAR)
predictions = tf.argmax(predictions, axis=3)
flatten_predictions = tf.reshape(predictions, shape=[-1])
flatten_masks = tf.reshape(masks, shape=[-1])
flatten_valid_masks = tf.reshape(valid_masks, shape=[-1])
super().update_state(flatten_masks, flatten_predictions,
tf.cast(flatten_valid_masks, tf.float32))
class PerClassIoU(iou.PerClassIoU):
"""Per Class IoU metric for semantic segmentation.
  This class uses iou.PerClassIoU to compute a batched per-class IoU when the
  predictions and groundtruth masks have been resized to the same size
  (rescale_predictions=False). It can also compute the per-class IoU at the
  original groundtruth sizes (rescale_predictions=True), in which case each
  prediction is rescaled back to the original image size.
"""
def __init__(
self, num_classes, rescale_predictions=False, name=None, dtype=None):
"""Constructs Segmentation evaluator class.
Args:
num_classes: `int`, number of classes.
rescale_predictions: `bool`, whether to scale back prediction to original
image sizes. If True, y_true['image_info'] is used to rescale
predictions.
      name: `str`, name of the metric instance.
dtype: data type of the metric result.
"""
self._rescale_predictions = rescale_predictions
super().__init__(num_classes=num_classes, name=name, dtype=dtype)
def update_state(self, y_true, y_pred):
"""Updates metric state.
Args:
      y_true: `dict`, dictionary with the following keys:
        - masks: [batch, height, width, 1], groundtruth masks.
        - valid_masks: [batch, height, width, 1], valid elements in the mask.
        - image_info: [batch, 4, 2], a tensor that holds information about
          the original and preprocessed images. Each entry is in the format of
          [[original_height, original_width], [input_height, input_width],
          [y_scale, x_scale], [y_offset, x_offset]], where
          [input_height, input_width] is the actual scaled image size, and
          [y_scale, x_scale] is the scaling factor, which is the ratio of
          scaled dimension / original dimension.
      y_pred: Tensor [batch, height_p, width_p, num_classes], predicted masks.
"""
predictions = y_pred
masks = y_true['masks']
valid_masks = y_true['valid_masks']
images_info = y_true['image_info']
    if isinstance(predictions, (tuple, list)):
predictions = tf.concat(predictions, axis=0)
masks = tf.concat(masks, axis=0)
valid_masks = tf.concat(valid_masks, axis=0)
images_info = tf.concat(images_info, axis=0)
    # Ignored mask elements are set to zero for the argmax op.
masks = tf.where(valid_masks, masks, tf.zeros_like(masks))
if self._rescale_predictions:
# This part can only run on cpu/gpu due to dynamic image resizing.
for i in range(tf.shape(predictions)[0]):
mask = masks[i]
valid_mask = valid_masks[i]
predicted_mask = predictions[i]
image_info = images_info[i]
rescale_size = tf.cast(
tf.math.ceil(image_info[1, :] / image_info[2, :]), tf.int32)
image_shape = tf.cast(image_info[0, :], tf.int32)
offsets = tf.cast(image_info[3, :], tf.int32)
predicted_mask = tf.image.resize(
predicted_mask,
rescale_size,
method=tf.image.ResizeMethod.BILINEAR)
predicted_mask = tf.image.crop_to_bounding_box(predicted_mask,
offsets[0], offsets[1],
image_shape[0],
image_shape[1])
mask = tf.image.crop_to_bounding_box(mask, 0, 0, image_shape[0],
image_shape[1])
valid_mask = tf.image.crop_to_bounding_box(valid_mask, 0, 0,
image_shape[0],
image_shape[1])
predicted_mask = tf.argmax(predicted_mask, axis=2)
flatten_predictions = tf.reshape(predicted_mask, shape=[1, -1])
flatten_masks = tf.reshape(mask, shape=[1, -1])
flatten_valid_masks = tf.reshape(valid_mask, shape=[1, -1])
super().update_state(flatten_masks, flatten_predictions,
tf.cast(flatten_valid_masks, tf.float32))
else:
predictions = tf.image.resize(
predictions,
tf.shape(masks)[1:3],
method=tf.image.ResizeMethod.BILINEAR)
predictions = tf.argmax(predictions, axis=3)
flatten_predictions = tf.reshape(predictions, shape=[-1])
flatten_masks = tf.reshape(masks, shape=[-1])
flatten_valid_masks = tf.reshape(valid_masks, shape=[-1])
super().update_state(flatten_masks, flatten_predictions,
tf.cast(flatten_valid_masks, tf.float32))
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for segmentation_metrics."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.evaluation import segmentation_metrics
class SegmentationMetricsTest(parameterized.TestCase, tf.test.TestCase):
def _create_test_data(self):
y_pred_cls0 = np.expand_dims(
np.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]], dtype=np.uint16),
axis=(0, -1))
y_pred_cls1 = np.expand_dims(
np.array([[0, 0, 0], [0, 0, 1], [0, 0, 1]], dtype=np.uint16),
axis=(0, -1))
y_pred = np.concatenate((y_pred_cls0, y_pred_cls1), axis=-1)
y_true = {
'masks':
np.expand_dims(
np.array([[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1],
[0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]],
dtype=np.uint16),
axis=(0, -1)),
'valid_masks':
np.ones([1, 6, 6, 1], dtype=np.uint16),
'image_info':
np.array([[[6, 6], [3, 3], [0.5, 0.5], [0, 0]]], dtype=np.float32)
}
return y_pred, y_true
@parameterized.parameters(True, False)
def test_mean_iou_metric(self, rescale_predictions):
    tf.config.run_functions_eagerly(True)
mean_iou_metric = segmentation_metrics.MeanIoU(
num_classes=2, rescale_predictions=rescale_predictions)
y_pred, y_true = self._create_test_data()
# Disable autograph for correct coverage statistics.
update_fn = tf.autograph.experimental.do_not_convert(
mean_iou_metric.update_state)
update_fn(y_true=y_true, y_pred=y_pred)
miou = mean_iou_metric.result()
self.assertAlmostEqual(miou.numpy(), 0.762, places=3)
@parameterized.parameters(True, False)
def test_per_class_mean_iou_metric(self, rescale_predictions):
per_class_iou_metric = segmentation_metrics.PerClassIoU(
num_classes=2, rescale_predictions=rescale_predictions)
y_pred, y_true = self._create_test_data()
# Disable autograph for correct coverage statistics.
update_fn = tf.autograph.experimental.do_not_convert(
per_class_iou_metric.update_state)
update_fn(y_true=y_true, y_pred=y_pred)
per_class_miou = per_class_iou_metric.result()
self.assertAllClose(per_class_miou.numpy(), [0.857, 0.667], atol=1e-3)
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""2D detection evaluator for the Waymo Open Dataset."""
import pprint
from absl import logging
import tensorflow as tf
from official.vision.ops import box_ops
from waymo_open_dataset import label_pb2
from waymo_open_dataset.metrics.python import wod_detection_evaluator
from waymo_open_dataset.protos import breakdown_pb2
from waymo_open_dataset.protos import metrics_pb2
def get_2d_detection_default_config():
"""Returns the config proto for WOD 2D detection Evaluation."""
config = metrics_pb2.Config()
config.breakdown_generator_ids.append(breakdown_pb2.Breakdown.OBJECT_TYPE)
difficulty = config.difficulties.add()
difficulty.levels.append(label_pb2.Label.LEVEL_1)
difficulty.levels.append(label_pb2.Label.LEVEL_2)
config.breakdown_generator_ids.append(breakdown_pb2.Breakdown.ALL_BUT_SIGN)
difficulty = config.difficulties.add()
difficulty.levels.append(label_pb2.Label.LEVEL_1)
difficulty.levels.append(label_pb2.Label.LEVEL_2)
config.matcher_type = metrics_pb2.MatcherProto.TYPE_HUNGARIAN
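  # Per-type IoU thresholds, indexed by label_pb2.Label.Type:
  # [UNKNOWN, VEHICLE, PEDESTRIAN, SIGN, CYCLIST].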
config.iou_thresholds.append(0.0)
config.iou_thresholds.append(0.7)
config.iou_thresholds.append(0.5)
config.iou_thresholds.append(0.5)
config.iou_thresholds.append(0.5)
config.box_type = label_pb2.Label.Box.TYPE_2D
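  # Sweep score cutoffs 0.00, 0.01, ..., 0.99, plus 1.0, for the PR curve.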
for i in range(100):
config.score_cutoffs.append(i * 0.01)
config.score_cutoffs.append(1.0)
return config
class WOD2dDetectionEvaluator(wod_detection_evaluator.WODDetectionEvaluator):
"""WOD 2D detection evaluation metric class."""
def __init__(self, config=None):
if config is None:
config = get_2d_detection_default_config()
super().__init__(config=config)
def _remove_padding(self, tensor_dict, num_valid):
"""Remove the paddings of the prediction/groundtruth data."""
result_tensor_dict = {}
gather_indices = tf.range(num_valid)
for k, v in tensor_dict.items():
if 'frame_id' in k:
result_tensor_dict[k] = tf.tile([v], [num_valid])
else:
result_tensor_dict[k] = tf.gather(v, gather_indices)
return result_tensor_dict
def update_state(self, groundtruths, predictions):
"""Update the metrics state with prediction and groundtruth data.
Args:
groundtruths: a dictionary of Tensors including the fields below.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- num_detections: a numpy array of int of shape [batch_size].
- boxes: a numpy array of float of shape [batch_size, K, 4].
- classes: a numpy array of int of shape [batch_size, K].
- difficulties: a numpy array of int of shape [batch_size, K].
predictions: a dictionary of tensors including the fields below.
Required fields:
- source_id: a numpy array of int or string of shape [batch_size].
- image_info: a numpy array of float of shape [batch_size, 4, 2].
- num_detections: a numpy array of int of shape [batch_size].
- detection_boxes: a numpy array of float of shape [batch_size, K, 4].
- detection_classes: a numpy array of int of shape [batch_size, K].
- detection_scores: a numpy array of float of shape [batch_size, K].
"""
# Preprocess potentially aggregated tensors.
for k, v in groundtruths.items():
if isinstance(v, tuple):
groundtruths[k] = tf.concat(v, axis=0)
for k, v in predictions.items():
if isinstance(v, tuple):
predictions[k] = tf.concat(v, axis=0)
    # Change cyclists' type id from 3 to 4, since 3 is reserved for signs.
groundtruth_type = tf.cast(groundtruths['classes'], tf.uint8)
groundtruth_type = tf.where(
tf.equal(groundtruth_type, 3),
tf.ones_like(groundtruth_type) * 4, groundtruth_type)
prediction_type = tf.cast(predictions['detection_classes'], tf.uint8)
prediction_type = tf.where(
tf.equal(prediction_type, 3),
tf.ones_like(prediction_type) * 4, prediction_type)
# Rescale the detection boxes back to original scale.
image_scale = tf.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2))
prediction_bbox = predictions['detection_boxes'] / image_scale
batch_size = tf.shape(groundtruths['source_id'])[0]
for i in tf.range(batch_size):
frame_groundtruths = {
'ground_truth_frame_id':
groundtruths['source_id'][i],
'ground_truth_bbox':
box_ops.yxyx_to_cycxhw(
tf.cast(groundtruths['boxes'][i], tf.float32)),
'ground_truth_type':
groundtruth_type[i],
'ground_truth_difficulty':
tf.cast(groundtruths['difficulties'][i], tf.uint8),
}
frame_groundtruths = self._remove_padding(
frame_groundtruths, groundtruths['num_detections'][i])
frame_predictions = {
'prediction_frame_id':
groundtruths['source_id'][i],
'prediction_bbox':
box_ops.yxyx_to_cycxhw(
tf.cast(prediction_bbox[i], tf.float32)),
'prediction_type':
prediction_type[i],
'prediction_score':
tf.cast(predictions['detection_scores'][i], tf.float32),
'prediction_overlap_nlz':
tf.zeros_like(predictions['detection_scores'][i], dtype=tf.bool)
}
frame_predictions = self._remove_padding(frame_predictions,
predictions['num_detections'][i])
super().update_state(frame_groundtruths, frame_predictions)
def evaluate(self):
"""Compute the final metrics."""
ap, _, _, _, _ = super().evaluate()
metric_dict = {}
for i, name in enumerate(self._breakdown_names):
# Skip sign metrics in 2d detection task.
if 'SIGN' in name:
continue
metric_dict['WOD metrics/{}/AP'.format(name)] = ap[i]
pp = pprint.PrettyPrinter()
logging.info('WOD Detection Metrics: \n %s', pp.pformat(metric_dict))
return metric_dict
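# Example usage (a sketch; the batch contents are illustrative assumptions,
# not part of this module):
#
#   evaluator = WOD2dDetectionEvaluator()
#   for groundtruths, predictions in eval_batches:
#     evaluator.update_state(groundtruths, predictions)
#   metrics = evaluator.evaluate()  # {'WOD metrics/<breakdown>/AP': ...}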
@@ -35,14 +35,14 @@ input and return an `ExampleModel` instance, similar to
As a simple example, we define a single model. However, you can split the
model implementation into individual components, such as backbones, decoders,
and heads, as we do
[here](https://github.com/tensorflow/models/blob/master/official/vision/beta/modeling).
[here](https://github.com/tensorflow/models/blob/master/official/vision/modeling).
Then, in the `build_example_model` function, you can hook these components
together to obtain your full model.
## Create Dataloader
A dataloader reads, decodes and parses the input data. We have created various
[dataloaders](https://github.com/tensorflow/models/blob/master/official/vision/beta/dataloaders)
[dataloaders](https://github.com/tensorflow/models/blob/master/official/vision/dataloaders)
to handle standard input formats for classification, detection and segmentation.
If you have non-standard or complex data, you may want to create your own
dataloader, which consists of a `Decoder` and a `Parser`, as sketched below.
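A minimal sketch of such a pair might look like the following (the tf.Example
feature keys are illustrative assumptions, not a prescribed schema):

```python
import tensorflow as tf

from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser


class ExampleDecoder(decoder.Decoder):
  """Deserializes a tf.Example proto into a dictionary of tensors."""

  def decode(self, serialized_example):
    features = {
        'image/encoded': tf.io.FixedLenFeature((), tf.string),
        'image/class/label': tf.io.FixedLenFeature((), tf.int64),
    }
    return tf.io.parse_single_example(serialized_example, features)


class ExampleParser(parser.Parser):
  """Turns decoded tensors into (image, label) model inputs."""

  def _parse_train_data(self, decoded_tensors):
    image = tf.io.decode_image(decoded_tensors['image/encoded'], channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)
    # Training-time augmentations would go here.
    return image, decoded_tensors['image/class/label']

  def _parse_eval_data(self, decoded_tensors):
    image = tf.io.decode_image(decoded_tensors['image/encoded'], channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)
    return image, decoded_tensors['image/class/label']
```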
@@ -123,10 +123,10 @@ together and is called by the base
You can create your own task by inheriting from the base
[Task](https://github.com/tensorflow/models/blob/master/official/core/base_task.py),
or from one of the
[tasks](https://github.com/tensorflow/models/blob/master/official/vision/beta/tasks/)
[tasks](https://github.com/tensorflow/models/blob/master/official/vision/tasks/)
we already defined, if most of the operations can be reused. An `ExampleTask`
inheriting from
[ImageClassificationTask](https://github.com/tensorflow/models/blob/master/official/vision/beta/tasks/image_classification.py#L32)
[ImageClassificationTask](https://github.com/tensorflow/models/blob/master/official/vision/tasks/image_classification.py#L32)
can be found
[here](example_task.py).
We will go through each important component of the task in the sections below.
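For instance, a minimal registered task might look like this (the
`ExampleTaskConfig` class is a hypothetical stand-in for your experiment
config):

```python
import dataclasses

from official.core import task_factory
from official.vision.configs import image_classification as ic_cfg
from official.vision.tasks import image_classification


@dataclasses.dataclass
class ExampleTaskConfig(ic_cfg.ImageClassificationTask):
  """Hypothetical task config extending the stock classification config."""


@task_factory.register_task_cls(ExampleTaskConfig)
class ExampleTask(image_classification.ImageClassificationTask):
  """Reuses image classification behavior; override methods as needed."""
```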
@@ -175,7 +175,7 @@ from official.vision.beta.projects.example import example_task
## Training
You can create your own trainer by branching from our core
[trainer](https://github.com/tensorflow/models/blob/master/official/vision/beta/train.py).
[trainer](https://github.com/tensorflow/models/blob/master/official/vision/train.py).
Just make sure you import the registry like this:
@@ -185,7 +185,7 @@
```python
from official.vision.beta.projects.example import registry_imports  # pylint: disable=unused-import
```
You can run training locally for testing purposes:
```bash
# Assume you are under official/vision/beta/projects.
# Assume you are under official/vision/projects.
python3 example/train.py \
--experiment=tf_vision_example_experiment \
--config_file=${PWD}/example/example_config_local.yaml \
@@ -210,5 +210,5 @@ python3 example/train.py \
--mode=train \
--tpu=$TPU_NAME \
    --model_dir=/tmp/tfvision_test/ \
--config_file=third_party/tensorflow_models/official/vision/beta/projects/example/example_config_tpu.yaml
--config_file=third_party/tensorflow_models/official/vision/examples/starter/example_config_tpu.yaml
```
@@ -13,9 +13,8 @@
# limitations under the License.
"""Example experiment configuration definition."""
from typing import List
import dataclasses
from typing import List
from official.core import config_definitions as cfg
from official.core import exp_factory
......
@@ -22,9 +22,9 @@ from typing import Mapping, List, Tuple
# Import libraries
import tensorflow as tf
from official.vision.beta.dataloaders import decoder
from official.vision.beta.dataloaders import parser
from official.vision.beta.ops import preprocess_ops
from official.vision.dataloaders import decoder
from official.vision.dataloaders import parser
from official.vision.ops import preprocess_ops
MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255)
STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255)
......
@@ -22,7 +22,7 @@ directly used from `official/vision/beta/modeling` directory.
from typing import Any, Mapping
# Import libraries
import tensorflow as tf
from official.vision.beta.projects.example import example_config as example_cfg
from official.vision.examples.starter import example_config as example_cfg
class ExampleModel(tf.keras.Model):
......
@@ -21,10 +21,10 @@ from official.common import dataset_fn
from official.core import base_task
from official.core import task_factory
from official.modeling import tf_utils
from official.vision.beta.dataloaders import input_reader_factory
from official.vision.beta.projects.example import example_config as exp_cfg
from official.vision.beta.projects.example import example_input
from official.vision.beta.projects.example import example_model
from official.vision.dataloaders import input_reader_factory
from official.vision.examples.starter import example_config as exp_cfg
from official.vision.examples.starter import example_input
from official.vision.examples.starter import example_model
@task_factory.register_task_cls(exp_cfg.ExampleTask)
......
@@ -21,7 +21,7 @@ to handle each file separately.
# pylint: disable=unused-import
from official.common import registry_imports
from official.vision.beta.projects.example import example_config
from official.vision.beta.projects.example import example_input
from official.vision.beta.projects.example import example_model
from official.vision.beta.projects.example import example_task
from official.vision.examples.starter import example_config
from official.vision.examples.starter import example_input
from official.vision.examples.starter import example_model
from official.vision.examples.starter import example_task