Commit b92025a9 authored by anivegesana

Merge branch 'master' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 1b425791 37536370
......@@ -43,11 +43,6 @@ class RankingTrainer(base_trainer.Trainer):
def train_loop_end(self) -> Dict[str, float]:
"""See base class."""
self.join()
# Checks if the model numeric status is stable and conducts the checkpoint
# recovery accordingly.
if self._recovery:
self._recovery.maybe_recover(self.train_loss.result().numpy(),
self.global_step.numpy())
logs = {}
for metric in self.train_metrics + [self.train_loss]:
logs[metric.name] = metric.result()
......
......@@ -50,6 +50,7 @@ class ASPP(hyperparams.Config):
dilation_rates: List[int] = dataclasses.field(default_factory=list)
dropout_rate: float = 0.0
num_filters: int = 256
use_depthwise_convolution: bool = False
pool_kernel_size: Optional[List[int]] = None # Use global average pooling.
......
......@@ -55,6 +55,7 @@ class Parser(hyperparams.Config):
aug_rand_hflip: bool = False
aug_scale_min: float = 1.0
aug_scale_max: float = 1.0
aug_policy: Optional[str] = None
skip_crowd_during_training: bool = True
max_num_instances: int = 100
......
......@@ -60,6 +60,7 @@ class SegmentationHead(hyperparams.Config):
level: int = 3
num_convs: int = 2
num_filters: int = 256
use_depthwise_convolution: bool = False
prediction_kernel_size: int = 1
upsample_factor: int = 1
feature_fusion: Optional[str] = None # None, deeplabv3plus, or pyramid_fusion
......
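The three hunks above add plain dataclass fields, so the new options can be set like any other hyperparameter. A minimal sketch, assuming the usual config module paths (the paths themselves are not shown in this diff):
from official.vision.beta.configs import decoders, semantic_segmentation

# `use_depthwise_convolution` defaults to False in both configs above.
aspp_cfg = decoders.ASPP(
    dilation_rates=[6, 12, 18],
    use_depthwise_convolution=True)
head_cfg = semantic_segmentation.SegmentationHead(
    level=3,
    use_depthwise_convolution=True)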
......@@ -15,7 +15,7 @@ done
cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
-P "${tmp_dir}" -A "trainvalno5k.json,5k.json,*10shot*.json,*30shot*.json" \
-P "${tmp_dir}" -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \
"http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"
......@@ -24,7 +24,7 @@ python process_coco_few_shot_json_files.py \
--logtostderr --workdir="${tmp_dir}"
for seed in {0..9}; do
for shots in 10 30; do
for shots in 1 3 5 10 30; do
python create_coco_tf_record.py \
--logtostderr \
--image_dir="${base_image_dir}/train2014" \
......
......@@ -53,7 +53,7 @@ CATEGORIES = ['airplane', 'apple', 'backpack', 'banana', 'baseball bat',
'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
'wine glass', 'zebra']
SEEDS = list(range(10))
SHOTS = [10, 30]
SHOTS = [1, 3, 5, 10, 30]
FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
......
......@@ -131,7 +131,6 @@ def convert_predictions_to_coco_annotations(predictions):
"""
coco_predictions = []
num_batches = len(predictions['source_id'])
batch_size = predictions['source_id'][0].shape[0]
max_num_detections = predictions['detection_classes'][0].shape[1]
use_outer_box = 'detection_outer_boxes' in predictions
for i in range(num_batches):
......@@ -144,6 +143,7 @@ def convert_predictions_to_coco_annotations(predictions):
else:
mask_boxes = predictions['detection_boxes']
batch_size = predictions['source_id'][i].shape[0]
for j in range(batch_size):
if 'detection_masks' in predictions:
image_masks = mask_ops.paste_instance_masks(
......@@ -211,9 +211,9 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
gt_annotations = []
num_batches = len(groundtruths['source_id'])
batch_size = groundtruths['source_id'][0].shape[0]
for i in range(num_batches):
max_num_instances = groundtruths['classes'][i].shape[1]
batch_size = groundtruths['source_id'][i].shape[0]
for j in range(batch_size):
num_instances = groundtruths['num_detections'][i][j]
if num_instances > max_num_instances:
......
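(Both hunks apply the same fix: `batch_size` is now read from the current batch inside the loop instead of once from the first batch, so a smaller final batch no longer indexes out of range.)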
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of the Panoptic Quality metric.
Panoptic Quality is an instance-based metric for evaluating the task of
image parsing, aka panoptic segmentation.
Please see the paper for details:
"Panoptic Segmentation", Alexander Kirillov, Kaiming He, Ross Girshick,
Carsten Rother and Piotr Dollar. arXiv:1801.00868, 2018.
Note that this metric class is branched from
https://github.com/tensorflow/models/blob/master/research/deeplab/evaluation/panoptic_quality.py
"""
import collections
import numpy as np
_EPSILON = 1e-10
def realdiv_maybe_zero(x, y):
"""Element-wise x / y where y may contain zeros, for those returns 0 too."""
return np.where(
np.less(np.abs(y), _EPSILON), np.zeros_like(x), np.divide(x, y))
def _ids_to_counts(id_array):
"""Given a numpy array, a mapping from each unique entry to its count."""
ids, counts = np.unique(id_array, return_counts=True)
return dict(zip(ids, counts))
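For intuition, a quick sketch of the two helpers on toy inputs (values assumed for illustration):
realdiv_maybe_zero(np.array([1., 2.]), np.array([2., 0.]))  # -> [0.5, 0.0]
_ids_to_counts(np.array([[0, 0], [0, 7]]))                  # -> {0: 3, 7: 1}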
class PanopticQuality:
"""Metric class for Panoptic Quality.
"Panoptic Segmentation" by Alexander Kirillov, Kaiming He, Ross Girshick,
Carsten Rother, Piotr Dollar.
https://arxiv.org/abs/1801.00868
"""
def __init__(self, num_categories, ignored_label, max_instances_per_category,
offset):
"""Initialization for PanopticQualityMetric.
Args:
num_categories: The number of segmentation categories (or "classes" in the
dataset.
ignored_label: A category id that is ignored in evaluation, e.g. the void
label as defined in COCO panoptic segmentation dataset.
max_instances_per_category: The maximum number of instances for each
category. Used in ensuring unique instance labels.
offset: The maximum number of unique labels. This is used, by multiplying
the ground-truth labels, to generate unique ids for individual regions
of overlap between groundtruth and predicted segments.
"""
self.num_categories = num_categories
self.ignored_label = ignored_label
self.max_instances_per_category = max_instances_per_category
self.offset = offset
self.reset()
def _naively_combine_labels(self, category_mask, instance_mask):
"""Naively creates a combined label array from categories and instances."""
return (category_mask.astype(np.uint32) * self.max_instances_per_category +
instance_mask.astype(np.uint32))
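For intuition, a toy encoding with hypothetical values (max_instances_per_category=16, category 3, instance 2):
combined = 3 * 16 + 2          # category * max_instances_per_category + instance
combined // 16, combined % 16  # -> (3, 2): category and instance recovered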
def compare_and_accumulate(self, groundtruths, predictions):
"""Compares predicted segmentation with groundtruth, accumulates its metric.
It is not assumed that instance ids are unique across different categories.
See for example combine_semantic_and_instance_predictions.py in official
PanopticAPI evaluation code for issues to consider when fusing category
and instance labels.
Instance ids of the ignored category have the meaning that id 0 is "void"
and remaining ones are crowd instances.
Args:
groundtruths: A dictionary containing groundtruth labels. It should contain
the following fields.
- category_mask: A 2D numpy uint16 array of groundtruth per-pixel
category labels.
- instance_mask: A 2D numpy uint16 array of groundtruth instance labels.
predictions: A dictionary containing the model outputs. It should contain
the following fields.
- category_mask: A 2D numpy uint16 array of predicted per-pixel
category labels.
- instance_mask: A 2D numpy uint16 array of predicted instance labels.
"""
groundtruth_category_mask = groundtruths['category_mask']
groundtruth_instance_mask = groundtruths['instance_mask']
predicted_category_mask = predictions['category_mask']
predicted_instance_mask = predictions['instance_mask']
# First, combine the category and instance labels so that every unique
# value for (category, instance) is assigned a unique integer label.
pred_segment_id = self._naively_combine_labels(predicted_category_mask,
predicted_instance_mask)
gt_segment_id = self._naively_combine_labels(groundtruth_category_mask,
groundtruth_instance_mask)
# Pre-calculate areas for all groundtruth and predicted segments.
gt_segment_areas = _ids_to_counts(gt_segment_id)
pred_segment_areas = _ids_to_counts(pred_segment_id)
# We assume there is only one void segment and it has instance id = 0.
void_segment_id = self.ignored_label * self.max_instances_per_category
# There may be other ignored groundtruth segments with instance id > 0; find
# those ids using the unique segment ids extracted with the area computation
# above.
ignored_segment_ids = {
gt_segment_id for gt_segment_id in gt_segment_areas
if (gt_segment_id //
self.max_instances_per_category) == self.ignored_label
}
# Next, combine the groundtruth and predicted labels. Dividing up the pixels
# based on which groundtruth segment and which predicted segment they belong
# to, this will assign a different 32-bit integer label to each choice
# of (groundtruth segment, predicted segment), encoded as
# gt_segment_id * offset + pred_segment_id.
intersection_id_array = (
gt_segment_id.astype(np.uint64) * self.offset +
pred_segment_id.astype(np.uint64))
# For every combination of (groundtruth segment, predicted segment) with a
# non-empty intersection, this counts the number of pixels in that
# intersection.
intersection_areas = _ids_to_counts(intersection_id_array)
# Helper function that computes the area of the overlap between a predicted
# segment and the ground-truth void/ignored segment.
def prediction_void_overlap(pred_segment_id):
void_intersection_id = void_segment_id * self.offset + pred_segment_id
return intersection_areas.get(void_intersection_id, 0)
# Compute overall ignored overlap.
def prediction_ignored_overlap(pred_segment_id):
total_ignored_overlap = 0
for ignored_segment_id in ignored_segment_ids:
intersection_id = ignored_segment_id * self.offset + pred_segment_id
total_ignored_overlap += intersection_areas.get(intersection_id, 0)
return total_ignored_overlap
# Sets populated with the ids of groundtruth/predicted segments that have
# been matched with overlapping predicted/groundtruth segments,
# respectively.
gt_matched = set()
pred_matched = set()
# Calculate IoU per pair of intersecting segments of the same category.
for intersection_id, intersection_area in intersection_areas.items():
gt_segment_id = int(intersection_id // self.offset)
pred_segment_id = int(intersection_id % self.offset)
gt_category = int(gt_segment_id // self.max_instances_per_category)
pred_category = int(pred_segment_id // self.max_instances_per_category)
if gt_category != pred_category:
continue
# Union between the groundtruth and predicted segments being compared does
# not include the portion of the predicted segment that consists of
# groundtruth "void" pixels.
union = (
gt_segment_areas[gt_segment_id] +
pred_segment_areas[pred_segment_id] - intersection_area -
prediction_void_overlap(pred_segment_id))
iou = intersection_area / union
if iou > 0.5:
self.tp_per_class[gt_category] += 1
self.iou_per_class[gt_category] += iou
gt_matched.add(gt_segment_id)
pred_matched.add(pred_segment_id)
# Count false negatives for each category.
for gt_segment_id in gt_segment_areas:
if gt_segment_id in gt_matched:
continue
category = gt_segment_id // self.max_instances_per_category
# Failing to detect a void segment is not a false negative.
if category == self.ignored_label:
continue
self.fn_per_class[category] += 1
# Count false positives for each category.
for pred_segment_id in pred_segment_areas:
if pred_segment_id in pred_matched:
continue
# A false positive is not penalized if it is mostly ignored in the
# groundtruth.
if (prediction_ignored_overlap(pred_segment_id) /
pred_segment_areas[pred_segment_id]) > 0.5:
continue
category = pred_segment_id // self.max_instances_per_category
self.fp_per_class[category] += 1
def _valid_categories(self):
"""Categories with a "valid" value for the metric, have > 0 instances.
We will ignore the `ignore_label` class and other classes which have
`tp + fn + fp = 0`.
Returns:
Boolean array of shape `[num_categories]`.
"""
valid_categories = np.not_equal(
self.tp_per_class + self.fn_per_class + self.fp_per_class, 0)
if self.ignored_label >= 0 and self.ignored_label < self.num_categories:
valid_categories[self.ignored_label] = False
return valid_categories
def result_per_category(self):
"""For supported metrics, return individual per-category metric values.
Returns:
A dictionary containing all per-class metrics; each metric is a numpy
array of shape `[self.num_categories]`, where index `i` is the metric
value over only that category.
"""
sq_per_class = realdiv_maybe_zero(self.iou_per_class, self.tp_per_class)
rq_per_class = realdiv_maybe_zero(
self.tp_per_class,
self.tp_per_class + 0.5 * self.fn_per_class + 0.5 * self.fp_per_class)
return {
'sq_per_class': sq_per_class,
'rq_per_class': rq_per_class,
'pq_per_class': np.multiply(sq_per_class, rq_per_class)
}
def result(self, is_thing=None):
"""Computes and returns the detailed metric results over all comparisons.
Args:
is_thing: A boolean array of length `num_categories`. The entry
`is_thing[category_id]` is True iff that category is a "thing" category
instead of "stuff."
Returns:
A dictionary with a breakdown of metrics and/or metric factors by things,
stuff, and all categories.
"""
results = self.result_per_category()
valid_categories = self._valid_categories()
# If known, break down which categories are valid _and_ things/stuff.
category_sets = collections.OrderedDict()
category_sets['All'] = valid_categories
if is_thing is not None:
category_sets['Things'] = np.logical_and(valid_categories, is_thing)
category_sets['Stuff'] = np.logical_and(valid_categories,
np.logical_not(is_thing))
for category_set_name, in_category_set in category_sets.items():
if np.any(in_category_set):
results.update({
f'{category_set_name}_pq':
np.mean(results['pq_per_class'][in_category_set]),
f'{category_set_name}_sq':
np.mean(results['sq_per_class'][in_category_set]),
f'{category_set_name}_rq':
np.mean(results['rq_per_class'][in_category_set]),
# The number of categories in this subset.
f'{category_set_name}_num_categories':
np.sum(in_category_set.astype(np.int32)),
})
else:
results[category_set_name] = {
f'{category_set_name}_pq': 0.,
f'{category_set_name}_sq': 0.,
f'{category_set_name}_rq': 0.,
f'{category_set_name}_num_categories': 0
}
return results
def reset(self):
"""Resets the accumulation to the metric class's state at initialization."""
self.iou_per_class = np.zeros(self.num_categories, dtype=np.float64)
self.tp_per_class = np.zeros(self.num_categories, dtype=np.float64)
self.fn_per_class = np.zeros(self.num_categories, dtype=np.float64)
self.fp_per_class = np.zeros(self.num_categories, dtype=np.float64)
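A minimal standalone sketch of the accumulate/result cycle on toy masks (masks assumed, not from the test suite). Per category, SQ is the mean IoU over true positives, RQ = TP / (TP + 0.5 * FN + 0.5 * FP), and PQ = SQ * RQ:
import numpy as np

pq = PanopticQuality(num_categories=1, ignored_label=2,
                     max_instances_per_category=16, offset=16)
masks = {
    'category_mask': np.zeros([4, 4], np.uint16),  # a single category 0.
    'instance_mask': np.ones([4, 4], np.uint16),   # a single instance 1.
}
pq.compare_and_accumulate(masks, masks)  # prediction == groundtruth.
results = pq.result()
# One true positive with IoU 1.0, so results['All_pq'] == 1.0
# (and likewise 'All_sq' and 'All_rq').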
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The panoptic quality evaluator.
The following snippet demonstrates the typical use of the interface:
evaluator = PanopticQualityEvaluator(...)
for _ in range(num_evals):
for _ in range(num_batches_per_eval):
predictions, groundtruths = predictor.predict(...)  # pop a batch.
evaluator.update_state(groundtruths, predictions)
evaluator.result() # finish one full eval and reset states.
See also: https://github.com/cocodataset/cocoapi/
"""
import numpy as np
import tensorflow as tf
from official.vision.beta.evaluation import panoptic_quality
class PanopticQualityEvaluator:
"""Panoptic Quality metric class."""
def __init__(self, num_categories, ignored_label, max_instances_per_category,
offset, is_thing=None):
"""Constructs Panoptic Quality evaluation class.
The class provides the interface to the Panoptic Quality metric.
Args:
num_categories: The number of segmentation categories (or "classes") in
the dataset.
ignored_label: A category id that is ignored in evaluation, e.g. the void
label as defined in the COCO panoptic segmentation dataset.
max_instances_per_category: The maximum number of instances for each
category. Used in ensuring unique instance labels.
offset: The maximum number of unique labels. This is used, by multiplying
the ground-truth labels, to generate unique ids for individual regions
of overlap between groundtruth and predicted segments.
is_thing: A boolean array of length `num_categories`. The entry
`is_thing[category_id]` is True iff that category is a "thing" category
instead of "stuff." Default to `None`, and it means categories are not
classified into these two categories.
"""
self._pq_metric_module = panoptic_quality.PanopticQuality(
num_categories, ignored_label, max_instances_per_category, offset)
self._is_thing = is_thing
self._required_prediction_fields = ['category_mask', 'instance_mask']
self._required_groundtruth_fields = ['category_mask', 'instance_mask']
self.reset_states()
@property
def name(self):
return 'panoptic_quality'
def reset_states(self):
"""Resets internal states for a fresh run."""
self._pq_metric_module.reset()
def result(self):
"""Evaluates detection results, and reset_states."""
results = self._pq_metric_module.result(self._is_thing)
self.reset_states()
return results
def _convert_to_numpy(self, groundtruths, predictions):
"""Converts tesnors to numpy arrays."""
if groundtruths:
labels = tf.nest.map_structure(lambda x: x.numpy(), groundtruths)
numpy_groundtruths = {}
for key, val in labels.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_groundtruths[key] = val
else:
numpy_groundtruths = groundtruths
if predictions:
outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions)
numpy_predictions = {}
for key, val in outputs.items():
if isinstance(val, tuple):
val = np.concatenate(val)
numpy_predictions[key] = val
else:
numpy_predictions = predictions
return numpy_groundtruths, numpy_predictions
def update_state(self, groundtruths, predictions):
"""Update and aggregate detection results and groundtruth data.
Args:
groundtruths: a dictionary of Tensors including the fields below. See also
different parsers under `../dataloader` for more details.
Required fields:
- category_mask: a numpy array of uint16 of shape [batch_size, H, W].
- instance_mask: a numpy array of uint16 of shape [batch_size, H, W].
predictions: a dictionary of tensors including the fields below. See
different parsers under `../dataloader` for more details.
Required fields:
- category_mask: a numpy array of uint16 of shape [batch_size, H, W].
- instance_mask: a numpy array of uint16 of shape [batch_size, H, W].
Raises:
ValueError: if the required prediction or groundtruth fields are not
present in the incoming `predictions` or `groundtruths`.
"""
groundtruths, predictions = self._convert_to_numpy(groundtruths,
predictions)
for k in self._required_prediction_fields:
if k not in predictions:
raise ValueError(
'Missing the required key `{}` in predictions!'.format(k))
for k in self._required_groundtruth_fields:
if k not in groundtruths:
raise ValueError(
'Missing the required key `{}` in groundtruths!'.format(k))
self._pq_metric_module.compare_and_accumulate(groundtruths, predictions)
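A minimal standalone sketch of the evaluator on toy tensors (inputs assumed, mirroring the snippet in the module docstring):
import tensorflow as tf

evaluator = PanopticQualityEvaluator(num_categories=1, ignored_label=2,
                                     max_instances_per_category=16, offset=16)
masks = {
    'category_mask': tf.zeros([6, 6], tf.uint16),
    'instance_mask': tf.ones([6, 6], tf.uint16),
}
evaluator.update_state(masks, masks)  # prediction == groundtruth.
results = evaluator.result()          # returns metrics and resets state.
# results['All_pq'] == 1.0 for this perfect match.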
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic_quality_evaluator."""
import numpy as np
import tensorflow as tf
from official.vision.beta.evaluation import panoptic_quality_evaluator
class PanopticQualityEvaluatorTest(tf.test.TestCase):
def test_multiple_batches(self):
category_mask = np.zeros([6, 6], np.uint16)
groundtruth_instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
good_det_instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruths = {
'category_mask': tf.convert_to_tensor(category_mask),
'instance_mask': tf.convert_to_tensor(groundtruth_instance_mask)
}
predictions = {
'category_mask': tf.convert_to_tensor(category_mask),
'instance_mask': tf.convert_to_tensor(good_det_instance_mask)
}
pq_evaluator = panoptic_quality_evaluator.PanopticQualityEvaluator(
num_categories=1,
ignored_label=2,
max_instances_per_category=16,
offset=16)
for _ in range(2):
pq_evaluator.update_state(groundtruths, predictions)
bad_det_instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
predictions['instance_mask'] = tf.convert_to_tensor(bad_det_instance_mask)
for _ in range(2):
pq_evaluator.update_state(groundtruths, predictions)
results = pq_evaluator.result()
np.testing.assert_array_equal(results['pq_per_class'],
[((28 / 30 + 6 / 8) + (27 / 32)) / 2 / 2])
np.testing.assert_array_equal(results['rq_per_class'], [3 / 4])
np.testing.assert_array_equal(results['sq_per_class'],
[((28 / 30 + 6 / 8) + (27 / 32)) / 3])
self.assertAlmostEqual(results['All_pq'], 0.63177083)
self.assertAlmostEqual(results['All_rq'], 0.75)
self.assertAlmostEqual(results['All_sq'], 0.84236111)
self.assertEqual(results['All_num_categories'], 1)
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Panoptic Quality metric.
Note that this metric test class is branched from
https://github.com/tensorflow/models/blob/master/research/deeplab/evaluation/panoptic_quality_test.py
"""
from absl.testing import absltest
import numpy as np
from official.vision.beta.evaluation import panoptic_quality
class PanopticQualityTest(absltest.TestCase):
def test_perfect_match(self):
category_mask = np.zeros([6, 6], np.uint16)
instance_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 1, 1, 1],
[1, 2, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': instance_mask
}
predictions = {
'category_mask': category_mask,
'instance_mask': instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=1,
ignored_label=2,
max_instances_per_category=16,
offset=16)
pq_metric.compare_and_accumulate(groundtruths, predictions)
np.testing.assert_array_equal(pq_metric.iou_per_class, [2.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [2])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [1.0])
np.testing.assert_array_equal(results['rq_per_class'], [1.0])
np.testing.assert_array_equal(results['sq_per_class'], [1.0])
self.assertAlmostEqual(results['All_pq'], 1.0)
self.assertAlmostEqual(results['All_rq'], 1.0)
self.assertAlmostEqual(results['All_sq'], 1.0)
self.assertEqual(results['All_num_categories'], 1)
def test_totally_wrong(self):
category_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0],
[0, 1, 1, 1, 1, 0],
[0, 1, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
instance_mask = np.zeros([6, 6], np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': instance_mask
}
predictions = {
'category_mask': 1 - category_mask,
'instance_mask': instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=2,
ignored_label=2,
max_instances_per_category=1,
offset=16)
pq_metric.compare_and_accumulate(groundtruths, predictions)
np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 0.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 0])
np.testing.assert_array_equal(pq_metric.fn_per_class, [1, 1])
np.testing.assert_array_equal(pq_metric.fp_per_class, [1, 1])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [0.0, 0.0])
np.testing.assert_array_equal(results['rq_per_class'], [0.0, 0.0])
np.testing.assert_array_equal(results['sq_per_class'], [0.0, 0.0])
self.assertAlmostEqual(results['All_pq'], 0.0)
self.assertAlmostEqual(results['All_rq'], 0.0)
self.assertAlmostEqual(results['All_sq'], 0.0)
self.assertEqual(results['All_num_categories'], 2)
def test_matches_by_iou(self):
groundtruth_instance_mask = np.array(
[
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 2, 2, 2, 1],
[1, 2, 2, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
good_det_instance_mask = np.array(
[
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 2, 2, 1],
[1, 2, 2, 2, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruths = {
'category_mask': np.zeros_like(groundtruth_instance_mask),
'instance_mask': groundtruth_instance_mask
}
predictions = {
'category_mask': np.zeros_like(good_det_instance_mask),
'instance_mask': good_det_instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=1,
ignored_label=2,
max_instances_per_category=16,
offset=16)
pq_metric.compare_and_accumulate(groundtruths, predictions)
# iou(1, 1) = 28/30
# iou(2, 2) = 6 / 8
np.testing.assert_array_almost_equal(pq_metric.iou_per_class,
[28 / 30 + 6 / 8])
np.testing.assert_array_equal(pq_metric.tp_per_class, [2])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'],
[(28 / 30 + 6 / 8) / 2])
np.testing.assert_array_equal(results['rq_per_class'], [1.0])
np.testing.assert_array_equal(results['sq_per_class'],
[(28 / 30 + 6 / 8) / 2])
self.assertAlmostEqual(results['All_pq'], (28 / 30 + 6 / 8) / 2)
self.assertAlmostEqual(results['All_rq'], 1.0)
self.assertAlmostEqual(results['All_sq'], (28 / 30 + 6 / 8) / 2)
self.assertEqual(results['All_num_categories'], 1)
bad_det_instance_mask = np.array(
[
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 2, 2, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
predictions['instance_mask'] = bad_det_instance_mask
pq_metric.reset()
pq_metric.compare_and_accumulate(groundtruths, predictions)
# iou(1, 1) = 27/32
np.testing.assert_array_almost_equal(pq_metric.iou_per_class, [27 / 32])
np.testing.assert_array_equal(pq_metric.tp_per_class, [1])
np.testing.assert_array_equal(pq_metric.fn_per_class, [1])
np.testing.assert_array_equal(pq_metric.fp_per_class, [1])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [27 / 32 / 2])
np.testing.assert_array_equal(results['rq_per_class'], [0.5])
np.testing.assert_array_equal(results['sq_per_class'], [27 / 32])
self.assertAlmostEqual(results['All_pq'], 27 / 32 / 2)
self.assertAlmostEqual(results['All_rq'], 0.5)
self.assertAlmostEqual(results['All_sq'], 27 / 32)
self.assertEqual(results['All_num_categories'], 1)
def test_wrong_instances(self):
category_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 1, 2, 2],
[1, 2, 2, 1, 2, 2],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruth_instance_mask = np.zeros([6, 6], dtype=np.uint16)
predicted_instance_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': groundtruth_instance_mask
}
predictions = {
'category_mask': category_mask,
'instance_mask': predicted_instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=3,
ignored_label=0,
max_instances_per_category=10,
offset=100)
pq_metric.compare_and_accumulate(groundtruths, predictions)
np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 1.0, 0.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 1, 0])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0, 0, 1])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0, 0, 2])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [0.0, 1.0, 0.0])
np.testing.assert_array_equal(results['rq_per_class'], [0.0, 1.0, 0.0])
np.testing.assert_array_equal(results['sq_per_class'], [0.0, 1.0, 0.0])
self.assertAlmostEqual(results['All_pq'], 0.5)
self.assertAlmostEqual(results['All_rq'], 0.5)
self.assertAlmostEqual(results['All_sq'], 0.5)
self.assertEqual(results['All_num_categories'], 2)
def test_instance_order_is_arbitrary(self):
category_mask = np.array([
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
[1, 2, 2, 1, 2, 2],
[1, 2, 2, 1, 2, 2],
[1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1],
],
dtype=np.uint16)
groundtruth_instance_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 1, 1, 0, 0, 0],
[0, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
predicted_instance_mask = np.array([
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
],
dtype=np.uint16)
groundtruths = {
'category_mask': category_mask,
'instance_mask': groundtruth_instance_mask
}
predictions = {
'category_mask': category_mask,
'instance_mask': predicted_instance_mask
}
pq_metric = panoptic_quality.PanopticQuality(
num_categories=3,
ignored_label=0,
max_instances_per_category=10,
offset=100)
pq_metric.compare_and_accumulate(groundtruths, predictions)
np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 1.0, 2.0])
np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 1, 2])
np.testing.assert_array_equal(pq_metric.fn_per_class, [0, 0, 0])
np.testing.assert_array_equal(pq_metric.fp_per_class, [0, 0, 0])
results = pq_metric.result()
np.testing.assert_array_equal(results['pq_per_class'], [0.0, 1.0, 1.0])
np.testing.assert_array_equal(results['rq_per_class'], [0.0, 1.0, 1.0])
np.testing.assert_array_equal(results['sq_per_class'], [0.0, 1.0, 1.0])
self.assertAlmostEqual(results['All_pq'], 1.0)
self.assertAlmostEqual(results['All_rq'], 1.0)
self.assertAlmostEqual(results['All_sq'], 1.0)
self.assertEqual(results['All_num_categories'], 2)
if __name__ == '__main__':
absltest.main()
......@@ -342,9 +342,10 @@ Berkin Akin, Suyog Gupta, and Andrew Howard
"""
MNMultiMAX_BLOCK_SPECS = {
'spec_name': 'MobileNetMultiMAX',
'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
'activation', 'expand_ratio',
'use_normalization', 'use_bias', 'is_output'],
'block_spec_schema': [
'block_fn', 'kernel_size', 'strides', 'filters', 'activation',
'expand_ratio', 'use_normalization', 'use_bias', 'is_output'
],
'block_specs': [
('convbn', 3, 2, 32, 'relu', None, True, False, False),
('invertedbottleneck', 3, 2, 32, 'relu', 3., None, False, True),
......@@ -363,15 +364,18 @@ MNMultiMAX_BLOCK_SPECS = {
('invertedbottleneck', 5, 1, 160, 'relu', 4., None, False, True),
('convbn', 1, 1, 960, 'relu', None, True, False, False),
('gpooling', None, None, None, None, None, None, None, False),
('convbn', 1, 1, 1280, 'relu', None, False, True, False),
# Remove bias and add batch norm for the last layer to support QAT
# and achieve slightly better accuracy.
('convbn', 1, 1, 1280, 'relu', None, True, False, False),
]
}
MNMultiAVG_BLOCK_SPECS = {
'spec_name': 'MobileNetMultiAVG',
'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
'activation', 'expand_ratio',
'use_normalization', 'use_bias', 'is_output'],
'block_spec_schema': [
'block_fn', 'kernel_size', 'strides', 'filters', 'activation',
'expand_ratio', 'use_normalization', 'use_bias', 'is_output'
],
'block_specs': [
('convbn', 3, 2, 32, 'relu', None, True, False, False),
('invertedbottleneck', 3, 2, 32, 'relu', 3., None, False, False),
......@@ -392,7 +396,9 @@ MNMultiAVG_BLOCK_SPECS = {
('invertedbottleneck', 5, 1, 192, 'relu', 4., None, False, True),
('convbn', 1, 1, 960, 'relu', None, True, False, False),
('gpooling', None, None, None, None, None, None, None, False),
('convbn', 1, 1, 1280, 'relu', None, False, True, False),
# Remove bias and add batch norm for the last layer to support QAT
# and achieve slightly better accuracy.
('convbn', 1, 1, 1280, 'relu', None, True, False, False),
]
}
......
......@@ -158,10 +158,10 @@ class MobileNetTest(parameterized.TestCase, tf.test.TestCase):
('MobileNetV3Small', 0.75): 1026552,
('MobileNetV3EdgeTPU', 1.0): 2849312,
('MobileNetV3EdgeTPU', 0.75): 1737288,
('MobileNetMultiAVG', 1.0): 3700576,
('MobileNetMultiAVG', 0.75): 2345864,
('MobileNetMultiMAX', 1.0): 3170720,
('MobileNetMultiMAX', 0.75): 2041976,
('MobileNetMultiAVG', 1.0): 3704416,
('MobileNetMultiAVG', 0.75): 2349704,
('MobileNetMultiMAX', 1.0): 3174560,
('MobileNetMultiMAX', 0.75): 2045816,
}
input_size = 224
......
......@@ -32,6 +32,12 @@ layers = tf.keras.layers
# Each element in the block configuration is in the following format:
# (block_fn, num_filters, block_repeats)
RESNET_SPECS = {
10: [
('residual', 64, 1),
('residual', 128, 1),
('residual', 256, 1),
('residual', 512, 1),
],
18: [
('residual', 64, 2),
('residual', 128, 2),
......
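With the new 10-layer entry, the smaller backbone is built the same way as the existing depths. A minimal sketch, assuming the `resnet.ResNet(model_id=...)` constructor that the test below exercises:
import tensorflow as tf
from official.vision.beta.modeling.backbones import resnet

backbone = resnet.ResNet(model_id=10)  # one residual block per stage.
endpoints = backbone(tf.keras.Input(shape=(128, 128, 3)))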
......@@ -28,6 +28,7 @@ from official.vision.beta.modeling.backbones import resnet
class ResNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(128, 10, 1),
(128, 18, 1),
(128, 34, 1),
(128, 50, 4),
......@@ -38,6 +39,7 @@ class ResNetTest(parameterized.TestCase, tf.test.TestCase):
endpoint_filter_scale):
"""Test creation of ResNet family models."""
resnet_params = {
10: 4915904,
18: 11190464,
34: 21306048,
50: 23561152,
......
......@@ -93,23 +93,6 @@ class ClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
def test_mobilenet_network_creation(self, mobilenet_model_id,
filter_size_scale):
"""Test for creation of a MobileNet classifier."""
mobilenet_params = {
('MobileNetV1', 1.0): 4254889,
('MobileNetV1', 0.75): 2602745,
('MobileNetV2', 1.0): 3540265,
('MobileNetV2', 0.75): 2664345,
('MobileNetV3Large', 1.0): 5508713,
('MobileNetV3Large', 0.75): 4013897,
('MobileNetV3Small', 1.0): 2555993,
('MobileNetV3Small', 0.75): 2052577,
('MobileNetV3EdgeTPU', 1.0): 4131593,
('MobileNetV3EdgeTPU', 0.75): 3019569,
('MobileNetMultiAVG', 1.0): 4982857,
('MobileNetMultiAVG', 0.75): 3628145,
('MobileNetMultiMAX', 1.0): 4453001,
('MobileNetMultiMAX', 0.75): 3324257,
}
inputs = np.random.rand(2, 224, 224, 3)
tf.keras.backend.set_image_data_format('channels_last')
......@@ -123,8 +106,6 @@ class ClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
num_classes=num_classes,
dropout_rate=0.2,
)
self.assertEqual(model.count_params(),
mobilenet_params[(mobilenet_model_id, filter_size_scale)])
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
......
......@@ -42,6 +42,7 @@ class ASPP(tf.keras.layers.Layer):
kernel_initializer: str = 'VarianceScaling',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
interpolation: str = 'bilinear',
use_depthwise_convolution: bool = False,
**kwargs):
"""Initializes an Atrous Spatial Pyramid Pooling (ASPP) layer.
......@@ -64,6 +65,8 @@ class ASPP(tf.keras.layers.Layer):
interpolation: A `str` of interpolation method. It should be one of
`bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`,
`gaussian`, or `mitchellcubic`.
use_depthwise_convolution: If True, depthwise separable convolutions will
be used in the Atrous Spatial Pyramid Pooling.
**kwargs: Additional keyword arguments to be passed.
"""
super(ASPP, self).__init__(**kwargs)
......@@ -80,6 +83,7 @@ class ASPP(tf.keras.layers.Layer):
'kernel_initializer': kernel_initializer,
'kernel_regularizer': kernel_regularizer,
'interpolation': interpolation,
'use_depthwise_convolution': use_depthwise_convolution,
}
def build(self, input_shape):
......@@ -100,7 +104,9 @@ class ASPP(tf.keras.layers.Layer):
dropout=self._config_dict['dropout_rate'],
kernel_initializer=self._config_dict['kernel_initializer'],
kernel_regularizer=self._config_dict['kernel_regularizer'],
interpolation=self._config_dict['interpolation'])
interpolation=self._config_dict['interpolation'],
use_depthwise_convolution=self._config_dict['use_depthwise_convolution']
)
def call(self, inputs: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
"""Calls the Atrous Spatial Pyramid Pooling (ASPP) layer on an input.
......@@ -167,6 +173,7 @@ def build_aspp_decoder(
level=decoder_cfg.level,
dilation_rates=decoder_cfg.dilation_rates,
num_filters=decoder_cfg.num_filters,
use_depthwise_convolution=decoder_cfg.use_depthwise_convolution,
pool_kernel_size=decoder_cfg.pool_kernel_size,
dropout_rate=decoder_cfg.dropout_rate,
use_sync_bn=norm_activation_config.use_sync_bn,
......
......@@ -70,6 +70,7 @@ class ASPPTest(parameterized.TestCase, tf.test.TestCase):
kernel_regularizer=None,
interpolation='bilinear',
dropout_rate=0.2,
use_depthwise_convolution=False,
)
network = aspp.ASPP(**kwargs)
......
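The flag threads from the config through `build_aspp_decoder` into the layer, so it can also be passed when constructing the decoder directly. A minimal sketch with argument names taken from the factory call above (other values assumed):
from official.vision.beta.modeling.decoders import aspp

aspp_layer = aspp.ASPP(
    level=4,
    dilation_rates=[6, 12, 18],
    num_filters=256,
    use_depthwise_convolution=True)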
......@@ -76,7 +76,7 @@ def build_maskrcnn(
backbone_config=model_config.backbone,
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
backbone(tf.keras.Input(input_specs.shape[1:]))
backbone_features = backbone(tf.keras.Input(input_specs.shape[1:]))
decoder = decoders.factory.build_decoder(
input_specs=backbone.output_specs,
......@@ -119,6 +119,13 @@ def build_maskrcnn(
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer,
name='detection_head')
# Build backbone, decoder and region proposal network:
if decoder:
decoder_features = decoder(backbone_features)
rpn_head(decoder_features)
if roi_sampler_config.cascade_iou_thresholds:
detection_head_cascade = [detection_head]
for cascade_num in range(len(roi_sampler_config.cascade_iou_thresholds)):
......@@ -326,6 +333,7 @@ def build_segmentation_model(
num_convs=head_config.num_convs,
prediction_kernel_size=head_config.prediction_kernel_size,
num_filters=head_config.num_filters,
use_depthwise_convolution=head_config.use_depthwise_convolution,
upsample_factor=head_config.upsample_factor,
feature_fusion=head_config.feature_fusion,
low_level=head_config.low_level,
......
......@@ -31,6 +31,7 @@ class SegmentationHead(tf.keras.layers.Layer):
level: Union[int, str],
num_convs: int = 2,
num_filters: int = 256,
use_depthwise_convolution: bool = False,
prediction_kernel_size: int = 1,
upsample_factor: int = 1,
feature_fusion: Optional[str] = None,
......@@ -53,6 +54,8 @@ class SegmentationHead(tf.keras.layers.Layer):
prediction layer.
num_filters: An `int` number to specify the number of filters used.
Default is 256.
use_depthwise_convolution: A bool to specify whether to use depthwise
separable convolutions.
prediction_kernel_size: An `int` number to specify the kernel size of the
prediction layer.
upsample_factor: An `int` number to specify the upsampling factor to
......@@ -84,6 +87,7 @@ class SegmentationHead(tf.keras.layers.Layer):
'level': level,
'num_convs': num_convs,
'num_filters': num_filters,
'use_depthwise_convolution': use_depthwise_convolution,
'prediction_kernel_size': prediction_kernel_size,
'upsample_factor': upsample_factor,
'feature_fusion': feature_fusion,
......@@ -104,12 +108,14 @@ class SegmentationHead(tf.keras.layers.Layer):
def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
"""Creates the variables of the segmentation head."""
use_depthwise_convolution = self._config_dict['use_depthwise_convolution']
random_initializer = tf.keras.initializers.RandomNormal(stddev=0.01)
conv_op = tf.keras.layers.Conv2D
conv_kwargs = {
'kernel_size': 3,
'kernel_size': 3 if not use_depthwise_convolution else 1,
'padding': 'same',
'use_bias': False,
'kernel_initializer': tf.keras.initializers.RandomNormal(stddev=0.01),
'kernel_initializer': random_initializer,
'kernel_regularizer': self._config_dict['kernel_regularizer'],
}
bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
......@@ -139,6 +145,16 @@ class SegmentationHead(tf.keras.layers.Layer):
self._convs = []
self._norms = []
for i in range(self._config_dict['num_convs']):
if use_depthwise_convolution:
self._convs.append(
tf.keras.layers.DepthwiseConv2D(
name='segmentation_head_depthwise_conv_{}'.format(i),
kernel_size=3,
padding='same',
use_bias=False,
depthwise_initializer=random_initializer,
depthwise_regularizer=self._config_dict['kernel_regularizer'],
depth_multiplier=1))
conv_name = 'segmentation_head_conv_{}'.format(i)
self._convs.append(
conv_op(
......
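With the flag enabled, each head convolution becomes a 3x3 depthwise convolution followed by the now-1x1 pointwise convolution built from `conv_kwargs`. A standalone sketch of the equivalent pattern in plain Keras:
import tensorflow as tf

x = tf.keras.Input(shape=(64, 64, 256))
y = tf.keras.layers.DepthwiseConv2D(
    kernel_size=3, padding='same', use_bias=False, depth_multiplier=1)(x)
y = tf.keras.layers.Conv2D(
    filters=256, kernel_size=1, padding='same', use_bias=False)(y)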