ModelZoo / ResNet50_tensorflow / Commits

Commit 3e3b0c64, authored Jun 02, 2022 by A. Unique TensorFlower

Merge pull request #10537 from srihari-humbarwadi:panoptic-deeplab

PiperOrigin-RevId: 452568716

Parents: 523c40b7 1f765c55
Showing 18 changed files with 3308 additions and 4 deletions (+3308, -4)
official/vision/beta/projects/panoptic_maskrcnn/README.md (+14, -0)
official/vision/beta/projects/panoptic_maskrcnn/configs/panoptic_deeplab.py (+346, -0)
official/vision/beta/projects/panoptic_maskrcnn/dataloaders/panoptic_deeplab_input.py (+359, -0)
official/vision/beta/projects/panoptic_maskrcnn/losses/panoptic_deeplab_losses.py (+148, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory.py (+106, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory_test.py (+48, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/heads/panoptic_deeplab_heads.py (+434, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/heads/panoptic_deeplab_heads_test.py (+96, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/layers/fusion_layers.py (+180, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/layers/panoptic_deeplab_merge.py (+568, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/layers/panoptic_deeplab_merge_test.py (+142, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_deeplab_model.py (+122, -0)
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_deeplab_model_test.py (+185, -0)
official/vision/beta/projects/panoptic_maskrcnn/ops/mask_ops.py (+55, -0)
official/vision/beta/projects/panoptic_maskrcnn/tasks/panoptic_deeplab.py (+408, -0)
official/vision/beta/projects/panoptic_maskrcnn/tasks/panoptic_deeplab_test.py (+79, -0)
official/vision/beta/projects/panoptic_maskrcnn/train.py (+6, -3)
official/vision/ops/augment.py (+12, -1)
official/vision/beta/projects/panoptic_maskrcnn/README.md
@@ -83,6 +83,12 @@ ResNet-50 | 3x | `panoptic_fpn_coco` | 40.64 | 36.29

**Note**: Here 1x schedule refers to ~12 epochs

### Panoptic Deeplab

Backbone | Experiment name | Overall PQ | Things PQ | Stuff PQ | Checkpoints
:---------------------| :-------------------------------| ---------- | --------- | -------- | ------------:
Dilated ResNet-50 | `panoptic_deeplab_resnet_coco` | 36.80 | 37.51 | 35.73 | [ckpt](gs://tf_model_garden/vision/panoptic/panoptic_deeplab/coco/resnet50)
Dilated ResNet-101 | `panoptic_deeplab_resnet_coco` | 38.39 | 39.47 | 36.75 | [ckpt](gs://tf_model_garden/vision/panoptic/panoptic_deeplab/coco/resnet101)

___

## Citation

```

@@ -94,4 +100,12 @@ ___
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}
@article{Cheng2020PanopticDeepLabAS,
  title={Panoptic-DeepLab: A Simple, Strong, and Fast Baseline for Bottom-Up Panoptic Segmentation},
  author={Bowen Cheng and Maxwell D. Collins and Yukun Zhu and Ting Liu and Thomas S. Huang and Hartwig Adam and Liang-Chieh Chen},
  journal={2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2020},
  pages={12472-12482}
}
```
official/vision/beta/projects/panoptic_maskrcnn/configs/panoptic_deeplab.py
new file (mode 100644)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Panoptic Deeplab configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

import numpy as np

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs.google import backbones


_COCO_INPUT_PATH_BASE = 'coco/tfrecords'
_COCO_TRAIN_EXAMPLES = 118287
_COCO_VAL_EXAMPLES = 5000


@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Panoptic deeplab parser."""
  ignore_label: int = 0
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_groundtruth: bool = True
  groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_hflip: bool = True
  aug_type: common.Augmentation = common.Augmentation()
  sigma: float = 8.0
  small_instance_area_threshold: int = 4096
  small_instance_weight: float = 3.0
  dtype = 'float32'


@dataclasses.dataclass
class TfExampleDecoder(common.TfExampleDecoder):
  """A simple TF Example decoder config."""
  panoptic_category_mask_key: str = 'image/panoptic/category_mask'
  panoptic_instance_mask_key: str = 'image/panoptic/instance_mask'


@dataclasses.dataclass
class DataDecoder(common.DataDecoder):
  """Data decoder config."""
  simple_decoder: TfExampleDecoder = TfExampleDecoder()


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  decoder: DataDecoder = DataDecoder()
  parser: Parser = Parser()
  input_path: str = ''
  drop_remainder: bool = True
  file_type: str = 'tfrecord'
  is_training: bool = True
  global_batch_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabHead(hyperparams.Config):
  """Panoptic Deeplab head config."""
  level: int = 3
  num_convs: int = 2
  num_filters: int = 256
  kernel_size: int = 5
  use_depthwise_convolution: bool = False
  upsample_factor: int = 1
  low_level: List[int] = dataclasses.field(default_factory=lambda: [3, 2])
  low_level_num_filters: List[int] = dataclasses.field(
      default_factory=lambda: [64, 32])
  fusion_num_output_filters: int = 256


@dataclasses.dataclass
class SemanticHead(PanopticDeeplabHead):
  """Semantic head config."""
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class InstanceHead(PanopticDeeplabHead):
  """Instance head config."""
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabPostProcessor(hyperparams.Config):
  """Panoptic Deeplab PostProcessing config."""
  output_size: List[int] = dataclasses.field(default_factory=list)
  center_score_threshold: float = 0.1
  thing_class_ids: List[int] = dataclasses.field(default_factory=list)
  label_divisor: int = 256 * 256 * 256
  stuff_area_limit: int = 4096
  ignore_label: int = 0
  nms_kernel: int = 7
  keep_k_centers: int = 200
  rescale_predictions: bool = True


@dataclasses.dataclass
class PanopticDeeplab(hyperparams.Config):
  """Panoptic Deeplab model config."""
  num_classes: int = 2
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 6
  norm_activation: common.NormActivation = common.NormActivation()
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='aspp')
  semantic_head: SemanticHead = SemanticHead()
  instance_head: InstanceHead = InstanceHead()
  shared_decoder: bool = False
  generate_panoptic_masks: bool = True
  post_processor: PanopticDeeplabPostProcessor = PanopticDeeplabPostProcessor()


@dataclasses.dataclass
class Losses(hyperparams.Config):
  label_smoothing: float = 0.0
  ignore_label: int = 0
  class_weights: List[float] = dataclasses.field(default_factory=list)
  l2_weight_decay: float = 1e-4
  top_k_percent_pixels: float = 0.15
  segmentation_loss_weight: float = 1.0
  center_heatmap_loss_weight: float = 200
  center_offset_loss_weight: float = 0.01


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  """Evaluation config."""
  ignored_label: int = 0
  max_instances_per_category: int = 256
  offset: int = 256 * 256 * 256
  is_thing: List[float] = dataclasses.field(default_factory=list)
  rescale_predictions: bool = True
  report_per_class_pq: bool = False
  report_per_class_iou: bool = False
  report_train_mean_iou: bool = True  # Turning this off can speed up training.


@dataclasses.dataclass
class PanopticDeeplabTask(cfg.TaskConfig):
  """Panoptic deeplab task config."""
  model: PanopticDeeplab = PanopticDeeplab()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(
      is_training=False, drop_remainder=False)
  losses: Losses = Losses()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[
      str, List[str]] = 'all'  # all, backbone, and/or decoder
  evaluation: Evaluation = Evaluation()


@exp_factory.register_config_factory('panoptic_deeplab_resnet_coco')
def panoptic_deeplab_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab."""
  train_steps = 200000
  train_batch_size = 64
  eval_batch_size = 1
  steps_per_epoch = _COCO_TRAIN_EXAMPLES // train_batch_size
  validation_steps = _COCO_VAL_EXAMPLES // eval_batch_size

  num_panoptic_categories = 201
  num_thing_categories = 91
  ignore_label = 0

  is_thing = [False]
  for idx in range(1, num_panoptic_categories):
    is_thing.append(True if idx <= num_thing_categories else False)

  input_size = [640, 640, 3]
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.math.log2(output_stride))

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype='bfloat16', enable_xla=True),
      task=PanopticDeeplabTask(
          init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/resnet50_v1/ckpt-436800',  # pylint: disable=line-too-long
          init_checkpoint_modules=['backbone'],
          model=PanopticDeeplab(
              num_classes=num_panoptic_categories,
              input_size=input_size,
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=50,
                      stem_type=stem_type,
                      output_stride=output_stride,
                      multigrid=multigrid,
                      se_ratio=0.25,
                      last_stage_repeats=1,
                      stochastic_depth_drop_rate=0.2)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      num_filters=256,
                      pool_kernel_size=input_size[:2],
                      dilation_rates=aspp_dilation_rates,
                      use_depthwise_convolution=True,
                      dropout_rate=0.1)),
              semantic_head=SemanticHead(
                  level=level,
                  num_convs=1,
                  num_filters=256,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[64, 32],
                  fusion_num_output_filters=256,
                  prediction_kernel_size=1),
              instance_head=InstanceHead(
                  level=level,
                  num_convs=1,
                  num_filters=32,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[32, 16],
                  fusion_num_output_filters=128,
                  prediction_kernel_size=1),
              shared_decoder=False,
              generate_panoptic_masks=True,
              post_processor=PanopticDeeplabPostProcessor(
                  output_size=input_size[:2],
                  center_score_threshold=0.1,
                  thing_class_ids=list(range(1, num_thing_categories)),
                  label_divisor=256,
                  stuff_area_limit=4096,
                  ignore_label=ignore_label,
                  nms_kernel=41,
                  keep_k_centers=200,
                  rescale_predictions=True)),
          losses=Losses(
              label_smoothing=0.0,
              ignore_label=ignore_label,
              l2_weight_decay=0.0,
              top_k_percent_pixels=0.2,
              segmentation_loss_weight=1.0,
              center_heatmap_loss_weight=200,
              center_offset_loss_weight=0.01),
          train_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_scale_min=0.5,
                  aug_scale_max=1.5,
                  aug_rand_hflip=True,
                  aug_type=common.Augmentation(
                      type='autoaug',
                      autoaug=common.AutoAugment(
                          augmentation_name='panoptic_deeplab_policy')),
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0)),
          validation_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              parser=Parser(
                  resize_eval_groundtruth=False,
                  groundtruth_padded_size=[640, 640],
                  aug_scale_min=1.0,
                  aug_scale_max=1.0,
                  aug_rand_hflip=False,
                  aug_type=None,
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0),
              drop_remainder=False),
          evaluation=Evaluation(
              ignored_label=ignore_label,
              max_instances_per_category=256,
              offset=256 * 256 * 256,
              is_thing=is_thing,
              rescale_predictions=True,
              report_per_class_pq=False,
              report_per_class_iou=False,
              report_train_mean_iou=False)),
      trainer=cfg.TrainerConfig(
          train_steps=train_steps,
          validation_steps=validation_steps,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'adam',
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.0005,
                      'decay_steps': train_steps,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
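The factory above registers the experiment under the name `panoptic_deeplab_resnet_coco`. A minimal sketch of how that registered config might be retrieved and tweaked through `official.core.exp_factory`, assuming the TF Model Garden `official` package plus this new config module are importable; the override values below are purely illustrative, not recommendations from this commit.

```python
# Minimal sketch; importing the config module runs the
# @exp_factory.register_config_factory decorator defined above.
from official.core import exp_factory
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab  # noqa: F401

config = exp_factory.get_exp_config('panoptic_deeplab_resnet_coco')

# Illustrative overrides (example values only).
config.task.train_data.global_batch_size = 32
config.trainer.train_steps = 1000

# hyperparams consistency check against the `restrictions` list above.
config.validate()
print(config.task.model.num_classes)  # 201 panoptic categories for COCO.
```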
official/vision/beta/projects/panoptic_maskrcnn/dataloaders/panoptic_deeplab_input.py
new file (mode 100644)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Panoptic Deeplab."""
from typing import List, Optional

import numpy as np
import tensorflow as tf

from official.vision.configs import common
from official.vision.dataloaders import parser
from official.vision.dataloaders import tf_example_decoder
from official.vision.ops import augment
from official.vision.ops import preprocess_ops


def _compute_gaussian_from_std(sigma):
  """Computes the Gaussian and its size from a given standard deviation."""
  size = int(6 * sigma + 3)
  x = np.arange(size, dtype=np.float)
  y = x[:, np.newaxis]
  x0, y0 = 3 * sigma + 1, 3 * sigma + 1
  gaussian = tf.constant(
      np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)),
      dtype=tf.float32)
  return gaussian, size


class TfExampleDecoder(tf_example_decoder.TfExampleDecoder):
  """Tensorflow Example proto decoder."""

  def __init__(
      self,
      regenerate_source_id: bool,
      panoptic_category_mask_key: str = 'image/panoptic/category_mask',
      panoptic_instance_mask_key: str = 'image/panoptic/instance_mask'):
    super(TfExampleDecoder, self).__init__(
        include_mask=True, regenerate_source_id=regenerate_source_id)
    self._panoptic_category_mask_key = panoptic_category_mask_key
    self._panoptic_instance_mask_key = panoptic_instance_mask_key
    self._panoptic_keys_to_features = {
        panoptic_category_mask_key:
            tf.io.FixedLenFeature((), tf.string, default_value=''),
        panoptic_instance_mask_key:
            tf.io.FixedLenFeature((), tf.string, default_value='')
    }

  def decode(self, serialized_example):
    decoded_tensors = super(TfExampleDecoder, self).decode(serialized_example)
    parsed_tensors = tf.io.parse_single_example(
        serialized_example, self._panoptic_keys_to_features)
    category_mask = tf.io.decode_image(
        parsed_tensors[self._panoptic_category_mask_key], channels=1)
    instance_mask = tf.io.decode_image(
        parsed_tensors[self._panoptic_instance_mask_key], channels=1)
    category_mask.set_shape([None, None, 1])
    instance_mask.set_shape([None, None, 1])

    decoded_tensors.update({
        'groundtruth_panoptic_category_mask': category_mask,
        'groundtruth_panoptic_instance_mask': instance_mask
    })
    return decoded_tensors


class Parser(parser.Parser):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(
      self,
      output_size: List[int],
      resize_eval_groundtruth: bool = True,
      groundtruth_padded_size: Optional[List[int]] = None,
      ignore_label: int = 0,
      aug_rand_hflip: bool = False,
      aug_scale_min: float = 1.0,
      aug_scale_max: float = 1.0,
      aug_type: Optional[common.Augmentation] = None,
      sigma: float = 8.0,
      small_instance_area_threshold: int = 4096,
      small_instance_weight: float = 3.0,
      dtype: str = 'float32'):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride
        2^max_level.
      resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are
        resized to output_size.
      groundtruth_padded_size: `Tensor` or `list` for [height, width]. When
        resize_eval_groundtruth is set to False, the groundtruth masks are
        padded to this size.
      ignore_label: `int`, pixels with the ignore label will not be used for
        training and evaluation.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      aug_type: An optional Augmentation object with params for AutoAugment.
      sigma: `float`, standard deviation for generating 2D Gaussian to encode
        centers.
      small_instance_area_threshold: `int`, small instance area threshold.
      small_instance_weight: `float`, small instance weight.
      dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
    """
    self._output_size = output_size
    self._resize_eval_groundtruth = resize_eval_groundtruth
    if (not resize_eval_groundtruth) and (groundtruth_padded_size is None):
      raise ValueError(
          'groundtruth_padded_size ([height, width]) needs to be '
          'specified when resize_eval_groundtruth is False.')
    self._groundtruth_padded_size = groundtruth_padded_size
    self._ignore_label = ignore_label

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    if aug_type and aug_type.type:
      if aug_type.type == 'autoaug':
        self._augmenter = augment.AutoAugment(
            augmentation_name=aug_type.autoaug.augmentation_name,
            cutout_const=aug_type.autoaug.cutout_const,
            translate_const=aug_type.autoaug.translate_const)
      else:
        raise ValueError(
            'Augmentation policy {} not supported.'.format(aug_type.type))
    else:
      self._augmenter = None

    self._dtype = dtype
    self._sigma = sigma
    self._gaussian, self._gaussian_size = _compute_gaussian_from_std(
        self._sigma)
    self._gaussian = tf.reshape(self._gaussian, shape=[-1])
    self._small_instance_area_threshold = small_instance_area_threshold
    self._small_instance_weight = small_instance_weight

  def _resize_and_crop_mask(self, mask, image_info, is_training):
    """Resizes and crops mask using `image_info` dict."""
    height = image_info[0][0]
    width = image_info[0][1]
    mask = tf.reshape(mask, shape=[1, height, width, 1])
    mask += 1

    if is_training or self._resize_eval_groundtruth:
      image_scale = image_info[2, :]
      offset = image_info[3, :]
      mask = preprocess_ops.resize_and_crop_masks(
          mask, image_scale, self._output_size, offset)
    else:
      mask = tf.image.pad_to_bounding_box(
          mask, 0, 0,
          self._groundtruth_padded_size[0],
          self._groundtruth_padded_size[1])
    mask -= 1

    # Assign ignore label to the padded region.
    mask = tf.where(
        tf.equal(mask, -1),
        self._ignore_label * tf.ones_like(mask),
        mask)
    mask = tf.squeeze(mask, axis=0)
    return mask

  def _parse_data(self, data, is_training):
    image = data['image']

    if self._augmenter is not None and is_training:
      image = self._augmenter.distort(image)

    image = preprocess_ops.normalize_image(image)

    category_mask = tf.cast(
        data['groundtruth_panoptic_category_mask'][:, :, 0], dtype=tf.float32)
    instance_mask = tf.cast(
        data['groundtruth_panoptic_instance_mask'][:, :, 0], dtype=tf.float32)

    # Flips image randomly during training.
    if self._aug_rand_hflip and is_training:
      masks = tf.stack([category_mask, instance_mask], axis=0)
      image, _, masks = preprocess_ops.random_horizontal_flip(
          image=image, masks=masks)
      category_mask = masks[0]
      instance_mask = masks[1]

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        self._output_size,
        aug_scale_min=self._aug_scale_min if is_training else 1.0,
        aug_scale_max=self._aug_scale_max if is_training else 1.0)

    category_mask = self._resize_and_crop_mask(
        category_mask, image_info, is_training=is_training)
    instance_mask = self._resize_and_crop_mask(
        instance_mask, image_info, is_training=is_training)

    (instance_centers_heatmap,
     instance_centers_offset,
     semantic_weights) = self._encode_centers_and_offets(
         instance_mask=instance_mask[:, :, 0])

    # Cast image and labels as self._dtype
    image = tf.cast(image, dtype=self._dtype)
    category_mask = tf.cast(category_mask, dtype=self._dtype)
    instance_mask = tf.cast(instance_mask, dtype=self._dtype)
    instance_centers_heatmap = tf.cast(
        instance_centers_heatmap, dtype=self._dtype)
    instance_centers_offset = tf.cast(
        instance_centers_offset, dtype=self._dtype)

    valid_mask = tf.not_equal(category_mask, self._ignore_label)
    things_mask = tf.not_equal(instance_mask, self._ignore_label)

    labels = {
        'category_mask': category_mask,
        'instance_mask': instance_mask,
        'instance_centers_heatmap': instance_centers_heatmap,
        'instance_centers_offset': instance_centers_offset,
        'semantic_weights': semantic_weights,
        'valid_mask': valid_mask,
        'things_mask': things_mask,
        'image_info': image_info
    }
    return image, labels

  def _parse_train_data(self, data):
    """Parses data for training."""
    return self._parse_data(data=data, is_training=True)

  def _parse_eval_data(self, data):
    """Parses data for evaluation."""
    return self._parse_data(data=data, is_training=False)

  def _encode_centers_and_offets(self, instance_mask):
    """Generates center heatmaps and offsets from instance id mask.

    Args:
      instance_mask: `tf.Tensor` of shape [height, width] representing
        groundtruth instance id mask.

    Returns:
      instance_centers_heatmap: `tf.Tensor` of shape [height, width, 1]
      instance_centers_offset: `tf.Tensor` of shape [height, width, 2]
    """
    shape = tf.shape(instance_mask)
    height, width = shape[0], shape[1]

    padding_start = int(3 * self._sigma + 1)
    padding_end = int(3 * self._sigma + 2)
    # padding should be equal to self._gaussian_size which is calculated
    # as size = int(6 * sigma + 3)
    padding = padding_start + padding_end

    instance_centers_heatmap = tf.zeros(
        shape=[height + padding, width + padding], dtype=tf.float32)
    centers_offset_y = tf.zeros(shape=[height, width], dtype=tf.float32)
    centers_offset_x = tf.zeros(shape=[height, width], dtype=tf.float32)
    semantic_weights = tf.ones(shape=[height, width], dtype=tf.float32)

    unique_instance_ids, _ = tf.unique(tf.reshape(instance_mask, [-1]))

    # The following method for encoding center heatmaps and offsets is inspired
    # by the reference implementation available at
    # https://github.com/google-research/deeplab2/blob/main/data/sample_generator.py  # pylint: disable=line-too-long
    for instance_id in unique_instance_ids:
      if instance_id == self._ignore_label:
        continue

      mask = tf.equal(instance_mask, instance_id)
      mask_area = tf.reduce_sum(tf.cast(mask, dtype=tf.float32))
      mask_indices = tf.cast(tf.where(mask), dtype=tf.float32)
      mask_center = tf.reduce_mean(mask_indices, axis=0)
      mask_center_y = tf.cast(tf.round(mask_center[0]), dtype=tf.int32)
      mask_center_x = tf.cast(tf.round(mask_center[1]), dtype=tf.int32)

      if mask_area < self._small_instance_area_threshold:
        semantic_weights = tf.where(
            mask, self._small_instance_weight, semantic_weights)

      gaussian_size = self._gaussian_size
      indices_y = tf.range(mask_center_y, mask_center_y + gaussian_size)
      indices_x = tf.range(mask_center_x, mask_center_x + gaussian_size)

      indices = tf.stack(tf.meshgrid(indices_y, indices_x))
      indices = tf.reshape(indices, shape=[2, gaussian_size * gaussian_size])
      indices = tf.transpose(indices)

      instance_centers_heatmap = tf.tensor_scatter_nd_max(
          tensor=instance_centers_heatmap,
          indices=indices,
          updates=self._gaussian)
      centers_offset_y = tf.tensor_scatter_nd_update(
          tensor=centers_offset_y,
          indices=tf.cast(mask_indices, dtype=tf.int32),
          updates=tf.cast(mask_center_y, dtype=tf.float32) -
          mask_indices[:, 0])
      centers_offset_x = tf.tensor_scatter_nd_update(
          tensor=centers_offset_x,
          indices=tf.cast(mask_indices, dtype=tf.int32),
          updates=tf.cast(mask_center_x, dtype=tf.float32) -
          mask_indices[:, 1])

    instance_centers_heatmap = instance_centers_heatmap[
        padding_start:padding_start + height,
        padding_start:padding_start + width]
    instance_centers_heatmap = tf.expand_dims(
        instance_centers_heatmap, axis=-1)

    instance_centers_offset = tf.stack(
        [centers_offset_y, centers_offset_x], axis=-1)

    return (instance_centers_heatmap,
            instance_centers_offset,
            semantic_weights)
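The parser above stamps each instance center onto a padded heatmap as a 2D Gaussian patch of side `6 * sigma + 3`, then crops the padding off so the peak lands back on the true center. A standalone numpy-only sketch of that arithmetic for the default `sigma=8.0`; the printed values are just sanity checks and the variable names mirror `_compute_gaussian_from_std` and `_encode_centers_and_offets` above.

```python
# Standalone sketch of the center-heatmap arithmetic (numpy only, no TF).
import numpy as np

sigma = 8.0
size = int(6 * sigma + 3)             # 51: side length of the Gaussian patch.
x = np.arange(size, dtype=np.float32)
y = x[:, np.newaxis]
x0 = y0 = 3 * sigma + 1               # 25: the patch peaks at index (25, 25).
gaussian = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))

padding_start = int(3 * sigma + 1)    # 25
padding_end = int(3 * sigma + 2)      # 26
assert padding_start + padding_end == size  # padded canvas fits one patch.

# Stamping a patch whose top-left corner sits at the rounded instance center
# of the unpadded mask places its peak at (center + 25) on the padded canvas;
# slicing [padding_start : padding_start + height] afterwards realigns the
# peak with the true instance center.
print(size, gaussian[25, 25], gaussian.max())  # 51 1.0 1.0
```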
official/vision/beta/projects/panoptic_maskrcnn/losses/panoptic_deeplab_losses.py
new file (mode 100644)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Losses used for panoptic deeplab model."""
import tensorflow as tf

from official.modeling import tf_utils
from official.vision.beta.projects.panoptic_maskrcnn.ops import mask_ops

EPSILON = 1e-5


class WeightedBootstrappedCrossEntropyLoss:
  """Weighted semantic segmentation loss."""

  def __init__(self, label_smoothing, class_weights, ignore_label,
               top_k_percent_pixels=1.0):
    self._top_k_percent_pixels = top_k_percent_pixels
    self._class_weights = class_weights
    self._ignore_label = ignore_label
    self._label_smoothing = label_smoothing

  def __call__(self, logits, labels, sample_weight=None):
    _, _, _, num_classes = logits.get_shape().as_list()

    logits = tf.image.resize(
        logits, tf.shape(labels)[1:3],
        method=tf.image.ResizeMethod.BILINEAR)

    valid_mask = tf.not_equal(labels, self._ignore_label)
    normalizer = tf.reduce_sum(tf.cast(valid_mask, tf.float32)) + EPSILON
    # Assign pixel with ignore label to class 0 (background). The loss on the
    # pixel will later be masked out.
    labels = tf.where(valid_mask, labels, tf.zeros_like(labels))

    labels = tf.squeeze(tf.cast(labels, tf.int32), axis=3)
    valid_mask = tf.squeeze(tf.cast(valid_mask, tf.float32), axis=3)

    onehot_labels = tf.one_hot(labels, num_classes)
    onehot_labels = onehot_labels * (
        1 - self._label_smoothing) + self._label_smoothing / num_classes
    cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=onehot_labels, logits=logits)

    if not self._class_weights:
      class_weights = [1] * num_classes
    else:
      class_weights = self._class_weights

    if num_classes != len(class_weights):
      raise ValueError(
          'Length of class_weights should be {}'.format(num_classes))

    weight_mask = tf.einsum('...y,y->...',
                            tf.one_hot(labels, num_classes, dtype=tf.float32),
                            tf.constant(class_weights, tf.float32))
    valid_mask *= weight_mask

    if sample_weight is not None:
      valid_mask *= sample_weight

    cross_entropy_loss *= tf.cast(valid_mask, tf.float32)

    if self._top_k_percent_pixels >= 1.0:
      loss = tf.reduce_sum(cross_entropy_loss) / normalizer
    else:
      loss = self._compute_top_k_loss(cross_entropy_loss)

    return loss

  def _compute_top_k_loss(self, loss):
    """Computes top k loss."""
    batch_size = tf.shape(loss)[0]
    loss = tf.reshape(loss, shape=[batch_size, -1])

    top_k_pixels = tf.cast(
        self._top_k_percent_pixels *
        tf.cast(tf.shape(loss)[-1], dtype=tf.float32),
        dtype=tf.int32)

    # shape: [batch_size, top_k_pixels]
    per_sample_top_k_loss = tf.map_fn(
        fn=lambda x: tf.nn.top_k(x, k=top_k_pixels, sorted=False)[0],
        elems=loss,
        parallel_iterations=32,
        fn_output_signature=tf.float32)

    # shape: [batch_size]
    per_sample_normalizer = tf.reduce_sum(
        tf.cast(tf.not_equal(per_sample_top_k_loss, 0.0), dtype=tf.float32),
        axis=-1) + EPSILON
    per_sample_normalized_loss = tf.reduce_sum(
        per_sample_top_k_loss, axis=-1) / per_sample_normalizer

    normalized_loss = tf_utils.safe_mean(per_sample_normalized_loss)
    return normalized_loss


class CenterHeatmapLoss:
  """Center heatmap loss."""

  def __init__(self):
    self._loss_fn = tf.losses.mean_squared_error

  def __call__(self, logits, labels, sample_weight=None):
    _, height, width, _ = labels.get_shape().as_list()
    logits = tf.image.resize(
        logits,
        size=[height, width],
        method=tf.image.ResizeMethod.BILINEAR)
    loss = self._loss_fn(y_true=labels, y_pred=logits)

    if sample_weight is not None:
      loss *= sample_weight
    return tf_utils.safe_mean(loss)


class CenterOffsetLoss:
  """Center offset loss."""

  def __init__(self):
    self._loss_fn = tf.losses.mean_absolute_error

  def __call__(self, logits, labels, sample_weight=None):
    _, height, width, _ = labels.get_shape().as_list()
    logits = mask_ops.resize_and_rescale_offsets(
        logits, target_size=[height, width])
    loss = self._loss_fn(y_true=labels, y_pred=logits)

    if sample_weight is not None:
      loss *= sample_weight
    return tf_utils.safe_mean(loss)
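A minimal usage sketch for the segmentation loss above, assuming TensorFlow 2.x and that the `official` package from this commit is importable. The shapes follow the `__call__` signature: logits `[batch, h, w, num_classes]` (bilinearly resized to the label resolution internally) and integer labels `[batch, H, W, 1]`; the random tensors and the 1/4-resolution logits are illustrative assumptions, not values taken from this commit.

```python
import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.losses.panoptic_deeplab_losses import (
    WeightedBootstrappedCrossEntropyLoss)

loss_fn = WeightedBootstrappedCrossEntropyLoss(
    label_smoothing=0.0,
    class_weights=[],          # empty => every class weighted 1.0
    ignore_label=0,            # pixels labeled 0 are masked out
    top_k_percent_pixels=0.2)  # bootstrap on the hardest 20% of pixels

logits = tf.random.normal([2, 160, 160, 201])   # e.g. logits at 1/4 resolution
labels = tf.random.uniform([2, 640, 640, 1], 0, 201, dtype=tf.int32)
weights = tf.ones([2, 640, 640])                # e.g. the parser's semantic_weights

loss = loss_fn(logits, labels, sample_weight=weights)
print(loss.numpy())
```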
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory.py
@@ -13,12 +13,17 @@
# limitations under the License.
"""Factory method to build panoptic segmentation model."""
from typing import Optional

import tensorflow as tf

from official.projects.deepmac_maskrcnn.tasks import deep_mask_head_rcnn
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as panoptic_deeplab_cfg
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as panoptic_maskrcnn_cfg
from official.vision.beta.projects.panoptic_maskrcnn.modeling import panoptic_deeplab_model
from official.vision.beta.projects.panoptic_maskrcnn.modeling import panoptic_maskrcnn_model
from official.vision.beta.projects.panoptic_maskrcnn.modeling.heads import panoptic_deeplab_heads
from official.vision.beta.projects.panoptic_maskrcnn.modeling.layers import panoptic_deeplab_merge
from official.vision.beta.projects.panoptic_maskrcnn.modeling.layers import panoptic_segmentation_generator
from official.vision.modeling import backbones
from official.vision.modeling.decoders import factory as decoder_factory

@@ -142,3 +147,104 @@ def build_panoptic_maskrcnn(
      aspect_ratios=model_config.anchor.aspect_ratios,
      anchor_size=model_config.anchor.anchor_size)
  return model


def build_panoptic_deeplab(
    input_specs: tf.keras.layers.InputSpec,
    model_config: panoptic_deeplab_cfg.PanopticDeeplab,
    l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None
) -> tf.keras.Model:
  """Builds Panoptic Deeplab model.

  Args:
    input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
    model_config: Config instance for the panoptic deeplab model.
    l2_regularizer: Optional `tf.keras.regularizers.Regularizer`, if specified,
      the model is built with the provided regularization layer.

  Returns:
    tf.keras.Model for the panoptic segmentation model.
  """
  norm_activation_config = model_config.norm_activation
  backbone = backbones.factory.build_backbone(
      input_specs=input_specs,
      backbone_config=model_config.backbone,
      norm_activation_config=norm_activation_config,
      l2_regularizer=l2_regularizer)

  semantic_decoder = decoder_factory.build_decoder(
      input_specs=backbone.output_specs,
      model_config=model_config,
      l2_regularizer=l2_regularizer)

  if model_config.shared_decoder:
    instance_decoder = None
  else:
    # semantic and instance share the same decoder type
    instance_decoder = decoder_factory.build_decoder(
        input_specs=backbone.output_specs,
        model_config=model_config,
        l2_regularizer=l2_regularizer)

  semantic_head_config = model_config.semantic_head
  instance_head_config = model_config.instance_head

  semantic_head = panoptic_deeplab_heads.SemanticHead(
      num_classes=model_config.num_classes,
      level=semantic_head_config.level,
      num_convs=semantic_head_config.num_convs,
      kernel_size=semantic_head_config.kernel_size,
      prediction_kernel_size=semantic_head_config.prediction_kernel_size,
      num_filters=semantic_head_config.num_filters,
      use_depthwise_convolution=semantic_head_config.use_depthwise_convolution,
      upsample_factor=semantic_head_config.upsample_factor,
      low_level=semantic_head_config.low_level,
      low_level_num_filters=semantic_head_config.low_level_num_filters,
      fusion_num_output_filters=semantic_head_config.fusion_num_output_filters,
      activation=norm_activation_config.activation,
      use_sync_bn=norm_activation_config.use_sync_bn,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)

  instance_head = panoptic_deeplab_heads.InstanceHead(
      level=instance_head_config.level,
      num_convs=instance_head_config.num_convs,
      kernel_size=instance_head_config.kernel_size,
      prediction_kernel_size=instance_head_config.prediction_kernel_size,
      num_filters=instance_head_config.num_filters,
      use_depthwise_convolution=instance_head_config.use_depthwise_convolution,
      upsample_factor=instance_head_config.upsample_factor,
      low_level=instance_head_config.low_level,
      low_level_num_filters=instance_head_config.low_level_num_filters,
      fusion_num_output_filters=instance_head_config.fusion_num_output_filters,
      activation=norm_activation_config.activation,
      use_sync_bn=norm_activation_config.use_sync_bn,
      norm_momentum=norm_activation_config.norm_momentum,
      norm_epsilon=norm_activation_config.norm_epsilon,
      kernel_regularizer=l2_regularizer)

  if model_config.generate_panoptic_masks:
    post_processing_config = model_config.post_processor
    post_processor = panoptic_deeplab_merge.PostProcessor(
        output_size=post_processing_config.output_size,
        center_score_threshold=post_processing_config.center_score_threshold,
        thing_class_ids=post_processing_config.thing_class_ids,
        label_divisor=post_processing_config.label_divisor,
        stuff_area_limit=post_processing_config.stuff_area_limit,
        ignore_label=post_processing_config.ignore_label,
        nms_kernel=post_processing_config.nms_kernel,
        keep_k_centers=post_processing_config.keep_k_centers,
        rescale_predictions=post_processing_config.rescale_predictions)
  else:
    post_processor = None

  model = panoptic_deeplab_model.PanopticDeeplabModel(
      backbone=backbone,
      semantic_decoder=semantic_decoder,
      instance_decoder=instance_decoder,
      semantic_head=semantic_head,
      instance_head=instance_head,
      post_processor=post_processor)

  return model
official/vision/beta/projects/panoptic_maskrcnn/modeling/factory_test.py
@@ -18,6 +18,8 @@ from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.distribute import combinations
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as panoptic_deeplab_cfg
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as panoptic_maskrcnn_cfg
from official.vision.beta.projects.panoptic_maskrcnn.modeling import factory
from official.vision.configs import backbones

@@ -62,5 +64,51 @@ class PanopticMaskRCNNBuilderTest(parameterized.TestCase, tf.test.TestCase):
        model_config=model_config,
        l2_regularizer=l2_regularizer)


class PanopticDeeplabBuilderTest(parameterized.TestCase, tf.test.TestCase):

  @combinations.generate(
      combinations.combine(
          input_size=[(640, 640), (512, 512)],
          backbone_type=['resnet', 'dilated_resnet'],
          decoder_type=['aspp', 'fpn'],
          level=[2, 3, 4],
          low_level=[(4, 3), (3, 2)],
          shared_decoder=[True, False],
          generate_panoptic_masks=[True, False]))
  def test_builder(self, input_size, backbone_type, level, low_level,
                   decoder_type, shared_decoder, generate_panoptic_masks):
    num_classes = 10
    input_specs = tf.keras.layers.InputSpec(
        shape=[None, input_size[0], input_size[1], 3])

    model_config = panoptic_deeplab_cfg.PanopticDeeplab(
        num_classes=num_classes,
        input_size=input_size,
        backbone=backbones.Backbone(type=backbone_type),
        decoder=decoders.Decoder(type=decoder_type),
        semantic_head=panoptic_deeplab_cfg.SemanticHead(
            level=level,
            num_convs=1,
            kernel_size=5,
            prediction_kernel_size=1,
            low_level=low_level),
        instance_head=panoptic_deeplab_cfg.InstanceHead(
            level=level,
            num_convs=1,
            kernel_size=5,
            prediction_kernel_size=1,
            low_level=low_level),
        shared_decoder=shared_decoder,
        generate_panoptic_masks=generate_panoptic_masks)

    l2_regularizer = tf.keras.regularizers.l2(5e-5)

    _ = factory.build_panoptic_deeplab(
        input_specs=input_specs,
        model_config=model_config,
        l2_regularizer=l2_regularizer)


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/panoptic_maskrcnn/modeling/heads/panoptic_deeplab_heads.py
new file (mode 100644)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions for Panoptic Deeplab heads."""
from typing import List, Union, Optional, Mapping, Tuple

import tensorflow as tf

from official.modeling import tf_utils
from official.vision.beta.projects.panoptic_maskrcnn.modeling.layers import fusion_layers
from official.vision.ops import spatial_transform_ops


class PanopticDeeplabHead(tf.keras.layers.Layer):
  """Creates a panoptic deeplab head."""

  def __init__(
      self,
      level: Union[int, str],
      num_convs: int = 2,
      num_filters: int = 256,
      kernel_size: int = 3,
      use_depthwise_convolution: bool = False,
      upsample_factor: int = 1,
      low_level: Optional[List[int]] = None,
      low_level_num_filters: Optional[List[int]] = None,
      fusion_num_output_filters: int = 256,
      activation: str = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes a panoptic deeplab head.

    Args:
      level: An `int` or `str`, level to use to build head.
      num_convs: An `int` number of stacked convolution before the last
        prediction layer.
      num_filters: An `int` number to specify the number of filters used.
        Default is 256.
      kernel_size: An `int` number to specify the kernel size of the
        stacked convolutions before the last prediction layer.
      use_depthwise_convolution: A bool to specify if use depthwise separable
        convolutions.
      upsample_factor: An `int` number to specify the upsampling factor to
        generate finer mask. Default 1 means no upsampling is applied.
      low_level: An `int` of backbone level to be used for feature fusion. It
        is used when feature_fusion is set to `deeplabv3plus`.
      low_level_num_filters: An `int` of reduced number of filters for the low
        level features before fusing it with higher level features. It is only
        used when feature_fusion is set to `deeplabv3plus`.
      fusion_num_output_filters: An `int` number to specify the number of
        filters used by output layer of fusion module. Default is 256.
      activation: A `str` that indicates which activation is used, e.g. 'relu',
        'swish', etc.
      use_sync_bn: A `bool` that indicates whether to use synchronized batch
        normalization across different replicas.
      norm_momentum: A `float` of normalization momentum for the moving
        average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(PanopticDeeplabHead, self).__init__(**kwargs)

    self._config_dict = {
        'level': level,
        'num_convs': num_convs,
        'num_filters': num_filters,
        'kernel_size': kernel_size,
        'use_depthwise_convolution': use_depthwise_convolution,
        'upsample_factor': upsample_factor,
        'low_level': low_level,
        'low_level_num_filters': low_level_num_filters,
        'fusion_num_output_filters': fusion_num_output_filters,
        'activation': activation,
        'use_sync_bn': use_sync_bn,
        'norm_momentum': norm_momentum,
        'norm_epsilon': norm_epsilon,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer
    }
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._bn_axis = -1
    else:
      self._bn_axis = 1
    self._activation = tf_utils.get_activation(activation)

  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
    """Creates the variables of the head."""
    kernel_size = self._config_dict['kernel_size']
    use_depthwise_convolution = self._config_dict['use_depthwise_convolution']
    random_initializer = tf.keras.initializers.RandomNormal(stddev=0.01)
    conv_op = tf.keras.layers.Conv2D
    conv_kwargs = {
        'kernel_size': kernel_size if not use_depthwise_convolution else 1,
        'padding': 'same',
        'use_bias': True,
        'kernel_initializer': random_initializer,
        'kernel_regularizer': self._config_dict['kernel_regularizer'],
    }
    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
             if self._config_dict['use_sync_bn']
             else tf.keras.layers.BatchNormalization)
    bn_kwargs = {
        'axis': self._bn_axis,
        'momentum': self._config_dict['norm_momentum'],
        'epsilon': self._config_dict['norm_epsilon'],
    }

    self._panoptic_deeplab_fusion = fusion_layers.PanopticDeepLabFusion(
        level=self._config_dict['level'],
        low_level=self._config_dict['low_level'],
        num_projection_filters=self._config_dict['low_level_num_filters'],
        num_output_filters=self._config_dict['fusion_num_output_filters'],
        use_depthwise_convolution=self._config_dict[
            'use_depthwise_convolution'],
        activation=self._config_dict['activation'],
        use_sync_bn=self._config_dict['use_sync_bn'],
        norm_momentum=self._config_dict['norm_momentum'],
        norm_epsilon=self._config_dict['norm_epsilon'],
        kernel_regularizer=self._config_dict['kernel_regularizer'],
        bias_regularizer=self._config_dict['bias_regularizer'])

    # Stacked convolutions layers.
    self._convs = []
    self._norms = []
    for i in range(self._config_dict['num_convs']):
      if use_depthwise_convolution:
        self._convs.append(
            tf.keras.layers.DepthwiseConv2D(
                name='panoptic_deeplab_head_depthwise_conv_{}'.format(i),
                kernel_size=kernel_size,
                padding='same',
                use_bias=True,
                depthwise_initializer=random_initializer,
                depthwise_regularizer=self._config_dict['kernel_regularizer'],
                depth_multiplier=1))
        norm_name = 'panoptic_deeplab_head_depthwise_norm_{}'.format(i)
        self._norms.append(bn_op(name=norm_name, **bn_kwargs))
      conv_name = 'panoptic_deeplab_head_conv_{}'.format(i)
      self._convs.append(
          conv_op(
              name=conv_name,
              filters=self._config_dict['num_filters'],
              **conv_kwargs))
      norm_name = 'panoptic_deeplab_head_norm_{}'.format(i)
      self._norms.append(bn_op(name=norm_name, **bn_kwargs))

    super().build(input_shape)

  def call(self,
           inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
                         Union[tf.Tensor, Mapping[str, tf.Tensor]]],
           training=None):
    """Forward pass of the head.

    It supports either a tuple of 2 tensors or 2 dictionaries. The first is
    backbone endpoints, and the second is decoder endpoints. When inputs are
    tensors, they are from a single level of feature maps. When inputs are
    dictionaries, they contain multiple levels of feature maps, where the key
    is the index of feature map.

    Args:
      inputs: A tuple of 2 feature map tensors of shape
        [batch, height_l, width_l, channels] or 2 dictionaries of tensors:
        - key: A `str` of the level of the multilevel features.
        - values: A `tf.Tensor` of the feature map tensors, whose shape is
            [batch, height_l, width_l, channels].
      training: A bool, runs the model in training/eval mode.

    Returns:
      A `tf.Tensor` of the fused backbone and decoder features.
    """
    if training is None:
      training = tf.keras.backend.learning_phase()

    x = self._panoptic_deeplab_fusion(inputs, training=training)
    for conv, norm in zip(self._convs, self._norms):
      x = conv(x)
      x = norm(x, training=training)
      x = self._activation(x)
    if self._config_dict['upsample_factor'] > 1:
      x = spatial_transform_ops.nearest_upsampling(
          x, scale=self._config_dict['upsample_factor'])

    return x

  def get_config(self):
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(self._config_dict.items()))

  @classmethod
  def from_config(cls, config):
    return cls(**config)


@tf.keras.utils.register_keras_serializable(package='Vision')
class SemanticHead(PanopticDeeplabHead):
  """Creates a semantic head."""

  def __init__(
      self,
      num_classes: int,
      level: Union[int, str],
      num_convs: int = 2,
      num_filters: int = 256,
      kernel_size: int = 3,
      prediction_kernel_size: int = 3,
      use_depthwise_convolution: bool = False,
      upsample_factor: int = 1,
      low_level: Optional[List[int]] = None,
      low_level_num_filters: Optional[List[int]] = None,
      fusion_num_output_filters: int = 256,
      activation: str = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes a semantic head.

    Args:
      num_classes: An `int` number of mask classification categories. The
        number of classes does not include background class.
      level: An `int` or `str`, level to use to build head.
      num_convs: An `int` number of stacked convolution before the last
        prediction layer.
      num_filters: An `int` number to specify the number of filters used.
        Default is 256.
      kernel_size: An `int` number to specify the kernel size of the
        stacked convolutions before the last prediction layer.
      prediction_kernel_size: An `int` number to specify the kernel size of
        the prediction layer.
      use_depthwise_convolution: A bool to specify if use depthwise separable
        convolutions.
      upsample_factor: An `int` number to specify the upsampling factor to
        generate finer mask. Default 1 means no upsampling is applied.
      low_level: An `int` of backbone level to be used for feature fusion. It
        is used when feature_fusion is set to `deeplabv3plus`.
      low_level_num_filters: An `int` of reduced number of filters for the low
        level features before fusing it with higher level features. It is only
        used when feature_fusion is set to `deeplabv3plus`.
      fusion_num_output_filters: An `int` number to specify the number of
        filters used by output layer of fusion module. Default is 256.
      activation: A `str` that indicates which activation is used, e.g. 'relu',
        'swish', etc.
      use_sync_bn: A `bool` that indicates whether to use synchronized batch
        normalization across different replicas.
      norm_momentum: A `float` of normalization momentum for the moving
        average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(SemanticHead, self).__init__(
        level=level,
        num_convs=num_convs,
        num_filters=num_filters,
        use_depthwise_convolution=use_depthwise_convolution,
        kernel_size=kernel_size,
        upsample_factor=upsample_factor,
        low_level=low_level,
        low_level_num_filters=low_level_num_filters,
        fusion_num_output_filters=fusion_num_output_filters,
        activation=activation,
        use_sync_bn=use_sync_bn,
        norm_momentum=norm_momentum,
        norm_epsilon=norm_epsilon,
        kernel_regularizer=kernel_regularizer,
        bias_regularizer=bias_regularizer,
        **kwargs)
    self._config_dict.update({
        'num_classes': num_classes,
        'prediction_kernel_size': prediction_kernel_size
    })

  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
    """Creates the variables of the semantic head."""
    super(SemanticHead, self).build(input_shape)
    self._classifier = tf.keras.layers.Conv2D(
        name='semantic_output',
        filters=self._config_dict['num_classes'],
        kernel_size=self._config_dict['prediction_kernel_size'],
        padding='same',
        bias_initializer=tf.zeros_initializer(),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        kernel_regularizer=self._config_dict['kernel_regularizer'],
        bias_regularizer=self._config_dict['bias_regularizer'])

  def call(self,
           inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
                         Union[tf.Tensor, Mapping[str, tf.Tensor]]],
           training=None):
    """Forward pass of the head."""
    if training is None:
      training = tf.keras.backend.learning_phase()
    x = super(SemanticHead, self).call(inputs, training=training)
    outputs = self._classifier(x)
    return outputs


@tf.keras.utils.register_keras_serializable(package='Vision')
class InstanceHead(PanopticDeeplabHead):
  """Creates an instance head."""

  def __init__(
      self,
      level: Union[int, str],
      num_convs: int = 2,
      num_filters: int = 256,
      kernel_size: int = 3,
      prediction_kernel_size: int = 3,
      use_depthwise_convolution: bool = False,
      upsample_factor: int = 1,
      low_level: Optional[List[int]] = None,
      low_level_num_filters: Optional[List[int]] = None,
      fusion_num_output_filters: int = 256,
      activation: str = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      **kwargs):
    """Initializes an instance center head.

    Args:
      level: An `int` or `str`, level to use to build head.
      num_convs: An `int` number of stacked convolution before the last
        prediction layer.
      num_filters: An `int` number to specify the number of filters used.
        Default is 256.
      kernel_size: An `int` number to specify the kernel size of the
        stacked convolutions before the last prediction layer.
      prediction_kernel_size: An `int` number to specify the kernel size of
        the prediction layer.
      use_depthwise_convolution: A bool to specify if use depthwise separable
        convolutions.
      upsample_factor: An `int` number to specify the upsampling factor to
        generate finer mask. Default 1 means no upsampling is applied.
      low_level: An `int` of backbone level to be used for feature fusion. It
        is used when feature_fusion is set to `deeplabv3plus`.
      low_level_num_filters: An `int` of reduced number of filters for the low
        level features before fusing it with higher level features. It is only
        used when feature_fusion is set to `deeplabv3plus`.
      fusion_num_output_filters: An `int` number to specify the number of
        filters used by output layer of fusion module. Default is 256.
      activation: A `str` that indicates which activation is used, e.g. 'relu',
        'swish', etc.
      use_sync_bn: A `bool` that indicates whether to use synchronized batch
        normalization across different replicas.
      norm_momentum: A `float` of normalization momentum for the moving
        average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D.
      **kwargs: Additional keyword arguments to be passed.
    """
    super(InstanceHead, self).__init__(
        level=level,
        num_convs=num_convs,
        num_filters=num_filters,
        use_depthwise_convolution=use_depthwise_convolution,
        kernel_size=kernel_size,
        upsample_factor=upsample_factor,
        low_level=low_level,
        low_level_num_filters=low_level_num_filters,
        fusion_num_output_filters=fusion_num_output_filters,
        activation=activation,
        use_sync_bn=use_sync_bn,
        norm_momentum=norm_momentum,
        norm_epsilon=norm_epsilon,
        kernel_regularizer=kernel_regularizer,
        bias_regularizer=bias_regularizer,
        **kwargs)
    self._config_dict.update({
        'prediction_kernel_size': prediction_kernel_size
    })

  def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]):
    """Creates the variables of the instance head."""
    super(InstanceHead, self).build(input_shape)
    self._instance_center_prediction_conv = tf.keras.layers.Conv2D(
        name='instance_centers_heatmap',
        filters=1,
        kernel_size=self._config_dict['prediction_kernel_size'],
        padding='same',
        bias_initializer=tf.zeros_initializer(),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        kernel_regularizer=self._config_dict['kernel_regularizer'],
        bias_regularizer=self._config_dict['bias_regularizer'])
    self._instance_center_regression_conv = tf.keras.layers.Conv2D(
        name='instance_centers_offset',
        filters=2,
        kernel_size=self._config_dict['prediction_kernel_size'],
        padding='same',
        bias_initializer=tf.zeros_initializer(),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        kernel_regularizer=self._config_dict['kernel_regularizer'],
        bias_regularizer=self._config_dict['bias_regularizer'])

  def call(self,
           inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]],
                         Union[tf.Tensor, Mapping[str, tf.Tensor]]],
           training=None):
    """Forward pass of the head."""
    if training is None:
      training = tf.keras.backend.learning_phase()
    x = super(InstanceHead, self).call(inputs, training=training)
    instance_centers_heatmap = self._instance_center_prediction_conv(x)
    instance_centers_offset = self._instance_center_regression_conv(x)
    outputs = {
        'instance_centers_heatmap': instance_centers_heatmap,
        'instance_centers_offset': instance_centers_offset
    }
    return outputs
official/vision/beta/projects/panoptic_maskrcnn/modeling/heads/panoptic_deeplab_heads_test.py  0 → 100644

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for panoptic_deeplab_heads.py."""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.modeling.heads import panoptic_deeplab_heads


class PanopticDeeplabHeadsTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      (2, (2,), (48,)),
      (3, (2,), (48,)),
      (2, (2,), (48,)),
      (2, (2,), (48,)),
      (3, (2,), (48,)),
      (3, (2,), (48,)),
      (4, (4, 3), (64, 32)),
      (4, (3, 2), (64, 32)))
  def test_forward(self, level, low_level, low_level_num_filters):
    backbone_features = {
        '3': np.random.rand(2, 128, 128, 16),
        '4': np.random.rand(2, 64, 64, 16),
        '5': np.random.rand(2, 32, 32, 16),
    }
    decoder_features = {
        '3': np.random.rand(2, 128, 128, 64),
        '4': np.random.rand(2, 64, 64, 64),
        '5': np.random.rand(2, 32, 32, 64),
        '6': np.random.rand(2, 16, 16, 64),
    }
    backbone_features['2'] = np.random.rand(2, 256, 256, 16)
    decoder_features['2'] = np.random.rand(2, 256, 256, 64)

    num_classes = 10
    semantic_head = panoptic_deeplab_heads.SemanticHead(
        num_classes=num_classes,
        level=level,
        low_level=low_level,
        low_level_num_filters=low_level_num_filters)
    instance_head = panoptic_deeplab_heads.InstanceHead(
        level=level,
        low_level=low_level,
        low_level_num_filters=low_level_num_filters)

    semantic_outputs = semantic_head((backbone_features, decoder_features))
    instance_outputs = instance_head((backbone_features, decoder_features))

    if str(level) in decoder_features:
      h, w = decoder_features[str(low_level[-1])].shape[1:3]
      self.assertAllEqual(
          semantic_outputs.numpy().shape,
          [2, h, w, num_classes])
      self.assertAllEqual(
          instance_outputs['instance_centers_heatmap'].numpy().shape,
          [2, h, w, 1])
      self.assertAllEqual(
          instance_outputs['instance_centers_offset'].numpy().shape,
          [2, h, w, 2])

  def test_serialize_deserialize(self):
    semantic_head = panoptic_deeplab_heads.SemanticHead(num_classes=2, level=3)
    instance_head = panoptic_deeplab_heads.InstanceHead(level=3)

    semantic_head_config = semantic_head.get_config()
    instance_head_config = instance_head.get_config()

    new_semantic_head = panoptic_deeplab_heads.SemanticHead.from_config(
        semantic_head_config)
    new_instance_head = panoptic_deeplab_heads.InstanceHead.from_config(
        instance_head_config)

    self.assertAllEqual(semantic_head.get_config(),
                        new_semantic_head.get_config())
    self.assertAllEqual(instance_head.get_config(),
                        new_instance_head.get_config())


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/panoptic_maskrcnn/modeling/layers/fusion_layers.py  0 → 100644

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains feature fusion blocks for panoptic segmentation models."""

from typing import Any, Callable, Dict, List, Mapping, Optional, Union

import tensorflow as tf

from official.modeling import tf_utils

# Type annotations.
States = Dict[str, tf.Tensor]
Activation = Union[str, Callable]


class PanopticDeepLabFusion(tf.keras.layers.Layer):
  """Creates a Panoptic DeepLab feature fusion layer.

  This implements the feature fusion introduced in the paper:
  Cheng et al. Panoptic-DeepLab
  (https://arxiv.org/pdf/1911.10194.pdf)
  """

  def __init__(
      self,
      level: int,
      low_level: List[int],
      num_projection_filters: List[int],
      num_output_filters: int = 256,
      use_depthwise_convolution: bool = False,
      activation: str = 'relu',
      use_sync_bn: bool = False,
      norm_momentum: float = 0.99,
      norm_epsilon: float = 0.001,
      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
      interpolation: str = 'bilinear',
      **kwargs):
    """Initializes the Panoptic DeepLab feature fusion layer.

    Args:
      level: An `int` level at which the decoder was applied.
      low_level: A list of `int` of minimum level to use in feature fusion.
      num_projection_filters: A list of `int` with number of filters for
        projection conv2d layers.
      num_output_filters: An `int` number of filters in output conv2d layers.
      use_depthwise_convolution: A bool to specify if use depthwise separable
        convolutions.
      activation: A `str` name of the activation function.
      use_sync_bn: A `bool` that indicates whether to use synchronized batch
        normalization across different replicas.
      norm_momentum: A `float` of normalization momentum for the moving average.
      norm_epsilon: A `float` added to variance to avoid dividing by zero.
      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
        Conv2D. Default is None.
      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
      interpolation: A `str` interpolation method for upsampling. Defaults to
        `bilinear`.
      **kwargs: Additional keyword arguments to be passed.

    Returns:
      A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width,
        feature_channel].
    """
    super(PanopticDeepLabFusion, self).__init__(**kwargs)

    self._config_dict = {
        'level': level,
        'low_level': low_level,
        'num_projection_filters': num_projection_filters,
        'num_output_filters': num_output_filters,
        'use_depthwise_convolution': use_depthwise_convolution,
        'activation': activation,
        'use_sync_bn': use_sync_bn,
        'norm_momentum': norm_momentum,
        'norm_epsilon': norm_epsilon,
        'kernel_regularizer': kernel_regularizer,
        'bias_regularizer': bias_regularizer,
        'interpolation': interpolation
    }
    if tf.keras.backend.image_data_format() == 'channels_last':
      self._channel_axis = -1
    else:
      self._channel_axis = 1
    self._activation = tf_utils.get_activation(activation)

  def build(self, input_shape: List[tf.TensorShape]):
    conv_op = tf.keras.layers.Conv2D
    conv_kwargs = {
        'padding': 'same',
        'use_bias': True,
        'kernel_initializer': tf.initializers.VarianceScaling(),
        'kernel_regularizer': self._config_dict['kernel_regularizer'],
    }
    bn_op = (tf.keras.layers.experimental.SyncBatchNormalization
             if self._config_dict['use_sync_bn']
             else tf.keras.layers.BatchNormalization)
    bn_kwargs = {
        'axis': self._channel_axis,
        'momentum': self._config_dict['norm_momentum'],
        'epsilon': self._config_dict['norm_epsilon'],
    }

    self._projection_convs = []
    self._projection_norms = []
    self._fusion_convs = []
    self._fusion_norms = []
    for i in range(len(self._config_dict['low_level'])):
      self._projection_convs.append(
          conv_op(
              filters=self._config_dict['num_projection_filters'][i],
              kernel_size=1,
              **conv_kwargs))
      if self._config_dict['use_depthwise_convolution']:
        depthwise_initializer = tf.keras.initializers.RandomNormal(stddev=0.01)
        fusion_conv = tf.keras.Sequential([
            tf.keras.layers.DepthwiseConv2D(
                kernel_size=5,
                padding='same',
                use_bias=True,
                depthwise_initializer=depthwise_initializer,
                depthwise_regularizer=self._config_dict['kernel_regularizer'],
                depth_multiplier=1),
            bn_op(**bn_kwargs),
            conv_op(
                filters=self._config_dict['num_output_filters'],
                kernel_size=1,
                **conv_kwargs)])
      else:
        fusion_conv = conv_op(
            filters=self._config_dict['num_output_filters'],
            kernel_size=5,
            **conv_kwargs)
      self._fusion_convs.append(fusion_conv)
      self._projection_norms.append(bn_op(**bn_kwargs))
      self._fusion_norms.append(bn_op(**bn_kwargs))

  def call(self, inputs, training=None):
    if training is None:
      training = tf.keras.backend.learning_phase()

    backbone_output = inputs[0]
    decoder_output = inputs[1][str(self._config_dict['level'])]

    x = decoder_output
    for i in range(len(self._config_dict['low_level'])):
      feature = backbone_output[str(self._config_dict['low_level'][i])]
      feature = self._projection_convs[i](feature)
      feature = self._projection_norms[i](feature, training=training)
      feature = self._activation(feature)

      shape = tf.shape(feature)
      x = tf.image.resize(
          x, size=[shape[1], shape[2]],
          method=self._config_dict['interpolation'])
      x = tf.cast(x, dtype=feature.dtype)
      x = tf.concat([x, feature], axis=self._channel_axis)
      x = self._fusion_convs[i](x)
      x = self._fusion_norms[i](x, training=training)
      x = self._activation(x)
    return x

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
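For reference, here is a minimal usage sketch of the fusion layer above (not part of the commit). The layer consumes a `(backbone_features, decoder_features)` tuple of level-keyed dictionaries; the levels, channel counts, and spatial sizes below are illustrative placeholders only.

import numpy as np
import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.modeling.layers import fusion_layers

# Dummy multi-scale features keyed by pyramid level, as the layer expects.
backbone_features = {'3': np.random.rand(1, 64, 64, 16).astype('float32'),
                     '4': np.random.rand(1, 32, 32, 16).astype('float32')}
decoder_features = {'5': np.random.rand(1, 16, 16, 64).astype('float32')}

fusion = fusion_layers.PanopticDeepLabFusion(
    level=5, low_level=[4, 3], num_projection_filters=[64, 32])
# The decoder feature is progressively upsampled and fused with each low-level
# feature, so the output matches the resolution of the last low level (3).
fused = fusion((backbone_features, decoder_features), training=False)
print(fused.shape)  # (1, 64, 64, 256)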
official/vision/beta/projects/panoptic_maskrcnn/modeling/layers/panoptic_deeplab_merge.py  0 → 100644

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file contains functions to post-process Panoptic-DeepLab results.

Note that the postprocessing class and the supporting functions are branched
from:
https://github.com/google-research/deeplab2/blob/main/model/post_processor/panoptic_deeplab.py
with minor changes.
"""

import functools
from typing import List, Tuple, Dict, Text

import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.ops import mask_ops


def _add_zero_padding(input_tensor: tf.Tensor, kernel_size: int,
                      rank: int) -> tf.Tensor:
  """Adds zero-padding to the input_tensor."""
  pad_total = kernel_size - 1
  pad_begin = pad_total // 2
  pad_end = pad_total - pad_begin
  if rank == 3:
    return tf.pad(
        input_tensor,
        paddings=[[pad_begin, pad_end], [pad_begin, pad_end], [0, 0]])
  else:
    return tf.pad(
        input_tensor,
        paddings=[[0, 0], [pad_begin, pad_end], [pad_begin, pad_end], [0, 0]])


def _get_semantic_predictions(semantic_logits: tf.Tensor) -> tf.Tensor:
  """Computes the semantic classes from the predictions.

  Args:
    semantic_logits: A tf.tensor of shape [batch, height, width, classes].

  Returns:
    A tf.Tensor containing the semantic class prediction of shape
      [batch, height, width].
  """
  return tf.argmax(semantic_logits, axis=-1, output_type=tf.int32)


def _get_instance_centers_from_heatmap(
    center_heatmap: tf.Tensor,
    center_threshold: float,
    nms_kernel_size: int,
    keep_k_centers: int) -> Tuple[tf.Tensor, tf.Tensor]:
  """Computes a list of instance centers.

  Args:
    center_heatmap: A tf.Tensor of shape [height, width, 1].
    center_threshold: A float setting the threshold for the center heatmap.
    nms_kernel_size: An integer specifying the nms kernel size.
    keep_k_centers: An integer specifying the number of centers to keep (K).
      Non-positive values will keep all centers.

  Returns:
    A tuple of
    - tf.Tensor of shape [N, 2] containing N center coordinates (after
      non-maximum suppression) in (y, x) order.
    - tf.Tensor of shape [height, width] containing the center heatmap after
      non-maximum suppression.
  """
  # Threshold center map.
  center_heatmap = tf.where(
      tf.greater(center_heatmap, center_threshold), center_heatmap, 0.0)

  # Non-maximum suppression.
  padded_map = _add_zero_padding(center_heatmap, nms_kernel_size, rank=3)
  pooled_center_heatmap = tf.keras.backend.pool2d(
      tf.expand_dims(padded_map, 0),
      pool_size=(nms_kernel_size, nms_kernel_size),
      strides=(1, 1),
      padding='valid',
      pool_mode='max')
  center_heatmap = tf.where(
      tf.equal(pooled_center_heatmap, center_heatmap), center_heatmap, 0.0)
  center_heatmap = tf.squeeze(center_heatmap, axis=[0, 3])

  # `centers` is of shape (N, 2) with (y, x) order of the second dimension.
  centers = tf.where(tf.greater(center_heatmap, 0.0))

  if keep_k_centers > 0 and tf.shape(centers)[0] > keep_k_centers:
    topk_scores, _ = tf.math.top_k(
        tf.reshape(center_heatmap, [-1]), keep_k_centers, sorted=False)
    centers = tf.where(tf.greater(center_heatmap, topk_scores[-1]))

  return centers, center_heatmap


def _find_closest_center_per_pixel(centers: tf.Tensor,
                                   center_offsets: tf.Tensor) -> tf.Tensor:
  """Assigns all pixels to their closest center.

  Args:
    centers: A tf.Tensor of shape [N, 2] containing N centers with coordinate
      order (y, x).
    center_offsets: A tf.Tensor of shape [height, width, 2].

  Returns:
    A tf.Tensor of shape [height, width] containing the index of the closest
      center, per pixel.
  """
  height = tf.shape(center_offsets)[0]
  width = tf.shape(center_offsets)[1]

  x_coord, y_coord = tf.meshgrid(tf.range(width), tf.range(height))
  coord = tf.stack([y_coord, x_coord], axis=-1)

  center_per_pixel = tf.cast(coord, tf.float32) + center_offsets

  # centers: [N, 2] -> [N, 1, 2].
  # center_per_pixel: [H, W, 2] -> [1, H*W, 2].
  centers = tf.cast(tf.expand_dims(centers, 1), tf.float32)
  center_per_pixel = tf.reshape(center_per_pixel, [height * width, 2])
  center_per_pixel = tf.expand_dims(center_per_pixel, 0)

  # distances: [N, H*W].
  distances = tf.norm(centers - center_per_pixel, axis=-1)

  return tf.reshape(tf.argmin(distances, axis=0), [height, width])


def _get_instances_from_heatmap_and_offset(
    semantic_segmentation: tf.Tensor, center_heatmap: tf.Tensor,
    center_offsets: tf.Tensor, center_threshold: float,
    thing_class_ids: tf.Tensor, nms_kernel_size: int,
    keep_k_centers: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Computes the instance assignment per pixel.

  Args:
    semantic_segmentation: A tf.Tensor containing the semantic labels of shape
      [height, width].
    center_heatmap: A tf.Tensor of shape [height, width, 1].
    center_offsets: A tf.Tensor of shape [height, width, 2].
    center_threshold: A float setting the threshold for the center heatmap.
    thing_class_ids: A tf.Tensor of shape [N] containing N thing indices.
    nms_kernel_size: An integer specifying the nms kernel size.
    keep_k_centers: An integer specifying the number of centers to keep.
      Negative values will keep all centers.

  Returns:
    A tuple of:
    - tf.Tensor containing the instance segmentation (filtered with the `thing`
      segmentation from the semantic segmentation output) with shape
      [height, width].
    - tf.Tensor containing the processed centermap with shape [height, width].
    - tf.Tensor containing instance scores (where higher "score" is a reasonable
      signal of a higher confidence detection). Will be of shape [height, width]
      with the score for a pixel being the score of the instance it belongs to.
      The scores will be zero for pixels in background/"stuff" regions.
  """
  thing_segmentation = tf.zeros_like(semantic_segmentation)
  for thing_id in thing_class_ids:
    thing_segmentation = tf.where(
        tf.equal(semantic_segmentation, thing_id), 1, thing_segmentation)

  centers, processed_center_heatmap = _get_instance_centers_from_heatmap(
      center_heatmap, center_threshold, nms_kernel_size, keep_k_centers)
  if tf.shape(centers)[0] == 0:
    return (tf.zeros_like(semantic_segmentation), processed_center_heatmap,
            tf.zeros_like(processed_center_heatmap))

  instance_center_index = _find_closest_center_per_pixel(
      centers, center_offsets)
  # Instance IDs should start with 1. So we use the index into the centers, but
  # shifted by 1.
  instance_segmentation = tf.cast(instance_center_index, tf.int32) + 1

  # The value of the heatmap at an instance's center is used as the score
  # for that instance.
  instance_scores = tf.gather_nd(processed_center_heatmap, centers)
  # This will map the instance scores back to the image space: where each pixel
  # has a value equal to the score of its instance.
  flat_center_index = tf.reshape(instance_center_index, [-1])
  instance_score_map = tf.gather(instance_scores, flat_center_index)
  instance_score_map = tf.reshape(
      instance_score_map, tf.shape(instance_segmentation))
  instance_score_map *= tf.cast(thing_segmentation, tf.float32)

  return (thing_segmentation * instance_segmentation,
          processed_center_heatmap, instance_score_map)


@tf.function
def _get_panoptic_predictions(
    semantic_logits: tf.Tensor, center_heatmap: tf.Tensor,
    center_offsets: tf.Tensor, center_threshold: float,
    thing_class_ids: tf.Tensor, label_divisor: int, stuff_area_limit: int,
    void_label: int, nms_kernel_size: int,
    keep_k_centers: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
  """Computes the semantic class and instance ID per pixel.

  Args:
    semantic_logits: A tf.Tensor of shape [batch, height, width, classes].
    center_heatmap: A tf.Tensor of shape [batch, height, width, 1].
    center_offsets: A tf.Tensor of shape [batch, height, width, 2].
    center_threshold: A float setting the threshold for the center heatmap.
    thing_class_ids: A tf.Tensor of shape [N] containing N thing indices.
    label_divisor: An integer specifying the label divisor of the dataset.
    stuff_area_limit: An integer specifying the number of pixels that stuff
      regions need to have at least. The stuff region will be included in the
      panoptic prediction, only if its area is larger than the limit; otherwise,
      it will be re-assigned as void_label.
    void_label: An integer specifying the void label.
    nms_kernel_size: An integer specifying the nms kernel size.
    keep_k_centers: An integer specifying the number of centers to keep.
      Negative values will keep all centers.

  Returns:
    A tuple of:
    - the panoptic prediction as tf.Tensor with shape [batch, height, width].
    - the centermap prediction as tf.Tensor with shape [batch, height, width].
    - the instance score maps as tf.Tensor with shape [batch, height, width].
    - the instance prediction as tf.Tensor with shape [batch, height, width].
  """
  semantic_prediction = _get_semantic_predictions(semantic_logits)
  batch_size = tf.shape(semantic_logits)[0]

  instance_map_lists = tf.TensorArray(
      tf.int32, size=batch_size, dynamic_size=False)
  center_map_lists = tf.TensorArray(
      tf.float32, size=batch_size, dynamic_size=False)
  instance_score_map_lists = tf.TensorArray(
      tf.float32, size=batch_size, dynamic_size=False)

  for i in tf.range(batch_size):
    (instance_map, center_map,
     instance_score_map) = _get_instances_from_heatmap_and_offset(
         semantic_prediction[i, ...], center_heatmap[i, ...],
         center_offsets[i, ...], center_threshold, thing_class_ids,
         nms_kernel_size, keep_k_centers)
    instance_map_lists = instance_map_lists.write(i, instance_map)
    center_map_lists = center_map_lists.write(i, center_map)
    instance_score_map_lists = instance_score_map_lists.write(
        i, instance_score_map)

  # This does not work with unknown shapes.
  instance_maps = instance_map_lists.stack()
  center_maps = center_map_lists.stack()
  instance_score_maps = instance_score_map_lists.stack()

  panoptic_prediction = _merge_semantic_and_instance_maps(
      semantic_prediction, instance_maps, thing_class_ids, label_divisor,
      stuff_area_limit, void_label)
  return (panoptic_prediction, center_maps, instance_score_maps, instance_maps)


@tf.function
def _merge_semantic_and_instance_maps(
    semantic_prediction: tf.Tensor,
    instance_maps: tf.Tensor,
    thing_class_ids: tf.Tensor,
    label_divisor: int,
    stuff_area_limit: int,
    void_label: int) -> tf.Tensor:
  """Merges semantic and instance maps to obtain panoptic segmentation.

  This function merges the semantic segmentation and class-agnostic
  instance segmentation to form the panoptic segmentation. In particular,
  the class label of each instance mask is inferred from the majority
  votes from the corresponding pixels in the semantic segmentation. This
  operation is first proposed in the DeeperLab paper and adopted by the
  Panoptic-DeepLab.

  - DeeperLab: Single-Shot Image Parser, T-J Yang, et al. arXiv:1902.05093.
  - Panoptic-DeepLab, B. Cheng, et al. In CVPR, 2020.

  Note that this function only supports batch = 1 for simplicity. Additionally,
  this function has a slightly different implementation from the provided
  TensorFlow implementation `merge_ops` but with a similar performance. This
  function is mainly used as a backup solution when you could not successfully
  compile the provided TensorFlow implementation. To reproduce our results,
  please use the provided TensorFlow implementation (i.e., not use this
  function, but the `merge_ops.merge_semantic_and_instance_maps`).

  Args:
    semantic_prediction: A tf.Tensor of shape [batch, height, width].
    instance_maps: A tf.Tensor of shape [batch, height, width].
    thing_class_ids: A tf.Tensor of shape [N] containing N thing indices.
    label_divisor: An integer specifying the label divisor of the dataset.
    stuff_area_limit: An integer specifying the number of pixels that stuff
      regions need to have at least. The stuff region will be included in the
      panoptic prediction, only if its area is larger than the limit; otherwise,
      it will be re-assigned as void_label.
    void_label: An integer specifying the void label.

  Returns:
    panoptic_prediction: A tf.Tensor with shape [batch, height, width].
  """
  prediction_shape = semantic_prediction.get_shape().as_list()
  # This implementation only supports batch size of 1. Since model construction
  # might lose batch size information (and leave it to None), override it here.
  prediction_shape[0] = 1
  semantic_prediction = tf.ensure_shape(semantic_prediction, prediction_shape)
  instance_maps = tf.ensure_shape(instance_maps, prediction_shape)

  # Default panoptic_prediction to have semantic label = void_label.
  panoptic_prediction = tf.ones_like(
      semantic_prediction) * void_label * label_divisor

  # Start to paste predicted `thing` regions to panoptic_prediction.
  # Infer `thing` segmentation regions from semantic prediction.
  semantic_thing_segmentation = tf.zeros_like(
      semantic_prediction, dtype=tf.bool)
  for thing_class in thing_class_ids:
    semantic_thing_segmentation = tf.math.logical_or(
        semantic_thing_segmentation, semantic_prediction == thing_class)

  # Keep track of how many instances for each semantic label.
  num_instance_per_semantic_label = tf.TensorArray(
      tf.int32, size=0, dynamic_size=True, clear_after_read=False)

  instance_ids, _ = tf.unique(tf.reshape(instance_maps, [-1]))
  for instance_id in instance_ids:
    # Instance ID 0 is reserved for crowd region.
    if instance_id == 0:
      continue
    thing_mask = tf.math.logical_and(instance_maps == instance_id,
                                     semantic_thing_segmentation)
    if tf.reduce_sum(tf.cast(thing_mask, tf.int32)) == 0:
      continue
    semantic_bin_counts = tf.math.bincount(
        tf.boolean_mask(semantic_prediction, thing_mask))
    semantic_majority = tf.cast(tf.math.argmax(semantic_bin_counts), tf.int32)

    while num_instance_per_semantic_label.size() <= semantic_majority:
      num_instance_per_semantic_label = num_instance_per_semantic_label.write(
          num_instance_per_semantic_label.size(), 0)

    new_instance_id = (
        num_instance_per_semantic_label.read(semantic_majority) + 1)
    num_instance_per_semantic_label = num_instance_per_semantic_label.write(
        semantic_majority, new_instance_id)
    panoptic_prediction = tf.where(
        thing_mask,
        tf.ones_like(panoptic_prediction) * semantic_majority * label_divisor
        + new_instance_id,
        panoptic_prediction)

  # Done with `num_instance_per_semantic_label` tensor array.
  num_instance_per_semantic_label.close()

  # Start to paste predicted `stuff` regions to panoptic prediction.
  instance_stuff_regions = instance_maps == 0
  semantic_ids, _ = tf.unique(tf.reshape(semantic_prediction, [-1]))
  for semantic_id in semantic_ids:
    if tf.reduce_sum(tf.cast(thing_class_ids == semantic_id, tf.int32)) > 0:
      continue
    # Check stuff area.
    stuff_mask = tf.math.logical_and(semantic_prediction == semantic_id,
                                     instance_stuff_regions)
    stuff_area = tf.reduce_sum(tf.cast(stuff_mask, tf.int32))
    if stuff_area >= stuff_area_limit:
      panoptic_prediction = tf.where(
          stuff_mask,
          tf.ones_like(panoptic_prediction) * semantic_id * label_divisor,
          panoptic_prediction)

  return panoptic_prediction


class PostProcessor(tf.keras.layers.Layer):
  """This class contains code of a Panoptic-Deeplab post-processor."""

  def __init__(
      self,
      output_size: List[int],
      center_score_threshold: float,
      thing_class_ids: List[int],
      label_divisor: int,
      stuff_area_limit: int,
      ignore_label: int,
      nms_kernel: int,
      keep_k_centers: int,
      rescale_predictions: bool,
      **kwargs):
    """Initializes a Panoptic-Deeplab post-processor.

    Args:
      output_size: A `List` of integers that represent the height and width of
        the output mask.
      center_score_threshold: A float setting the threshold for the center
        heatmap.
      thing_class_ids: An integer list of shape [N] containing N thing indices.
      label_divisor: An integer specifying the label divisor of the dataset.
      stuff_area_limit: An integer specifying the number of pixels that stuff
        regions need to have at least. The stuff region will be included in the
        panoptic prediction, only if its area is larger than the limit;
        otherwise, it will be re-assigned as void_label.
      ignore_label: An integer specifying the void label.
      nms_kernel: An integer specifying the nms kernel size.
      keep_k_centers: An integer specifying the number of centers to keep.
        Negative values will keep all centers.
      rescale_predictions: `bool`, whether to scale back prediction to original
        image sizes. If True, image_info is used to rescale predictions.
      **kwargs: additional kwargs arguments.
    """
    super(PostProcessor, self).__init__(**kwargs)

    self._config_dict = {
        'output_size': output_size,
        'center_score_threshold': center_score_threshold,
        'thing_class_ids': thing_class_ids,
        'label_divisor': label_divisor,
        'stuff_area_limit': stuff_area_limit,
        'ignore_label': ignore_label,
        'nms_kernel': nms_kernel,
        'keep_k_centers': keep_k_centers,
        'rescale_predictions': rescale_predictions
    }
    self._post_processor = functools.partial(
        _get_panoptic_predictions,
        center_threshold=center_score_threshold,
        thing_class_ids=tf.convert_to_tensor(thing_class_ids),
        label_divisor=label_divisor,
        stuff_area_limit=stuff_area_limit,
        void_label=ignore_label,
        nms_kernel_size=nms_kernel,
        keep_k_centers=keep_k_centers)

  def _resize_and_pad_masks(self, mask, image_info):
    """Resizes masks to match the original image shape and pads to `output_size`.

    Args:
      mask: a padded mask tensor.
      image_info: a tensor that holds information about original and
        preprocessed images.

    Returns:
      resized and padded masks: tf.Tensor.
    """
    rescale_size = tf.cast(
        tf.math.ceil(image_info[1, :] / image_info[2, :]), tf.int32)
    image_shape = tf.cast(image_info[0, :], tf.int32)
    offsets = tf.cast(image_info[3, :], tf.int32)

    mask = tf.image.resize(mask, rescale_size, method='bilinear')
    mask = tf.image.crop_to_bounding_box(
        mask, offsets[0], offsets[1], image_shape[0], image_shape[1])
    mask = tf.image.pad_to_bounding_box(
        mask, 0, 0,
        self._config_dict['output_size'][0],
        self._config_dict['output_size'][1])
    return mask

  def _resize_and_pad_offset_mask(self, mask, image_info):
    """Rescales and resizes offset masks and pads to `output_size`.

    Args:
      mask: a padded offset mask tensor.
      image_info: a tensor that holds information about original and
        preprocessed images.

    Returns:
      rescaled, resized and padded masks: tf.Tensor.
    """
    rescale_size = tf.cast(
        tf.math.ceil(image_info[1, :] / image_info[2, :]), tf.int32)
    image_shape = tf.cast(image_info[0, :], tf.int32)
    offsets = tf.cast(image_info[3, :], tf.int32)

    mask = mask_ops.resize_and_rescale_offsets(
        tf.expand_dims(mask, axis=0), rescale_size)[0]
    mask = tf.image.crop_to_bounding_box(
        mask, offsets[0], offsets[1], image_shape[0], image_shape[1])
    mask = tf.image.pad_to_bounding_box(
        mask, 0, 0,
        self._config_dict['output_size'][0],
        self._config_dict['output_size'][1])
    return mask

  def call(self,
           result_dict: Dict[Text, tf.Tensor],
           image_info: tf.Tensor) -> Dict[Text, tf.Tensor]:
    """Performs the post-processing given model predicted results.

    Args:
      result_dict: A dictionary of tf.Tensor containing model results. The dict
        has to contain
        - segmentation_outputs
        - instance_centers_heatmap
        - instance_centers_offset
      image_info: A tf.Tensor of image infos.

    Returns:
      The post-processed dict of tf.Tensor, containing the following keys:
        - panoptic_outputs
        - category_mask
        - instance_mask
        - instance_centers
        - instance_scores
    """
    if self._config_dict['rescale_predictions']:
      segmentation_outputs = tf.map_fn(
          fn=lambda x: self._resize_and_pad_masks(x[0], x[1]),
          elems=(result_dict['segmentation_outputs'], image_info),
          fn_output_signature=tf.float32,
          parallel_iterations=32)
      instance_centers_heatmap = tf.map_fn(
          fn=lambda x: self._resize_and_pad_masks(x[0], x[1]),
          elems=(result_dict['instance_centers_heatmap'], image_info),
          fn_output_signature=tf.float32,
          parallel_iterations=32)
      instance_centers_offset = tf.map_fn(
          fn=lambda x: self._resize_and_pad_offset_mask(x[0], x[1]),
          elems=(result_dict['instance_centers_offset'], image_info),
          fn_output_signature=tf.float32,
          parallel_iterations=32)
    else:
      segmentation_outputs = tf.image.resize(
          result_dict['segmentation_outputs'],
          size=self._config_dict['output_size'],
          method='bilinear')
      instance_centers_heatmap = tf.image.resize(
          result_dict['instance_centers_heatmap'],
          size=self._config_dict['output_size'],
          method='bilinear')
      instance_centers_offset = mask_ops.resize_and_rescale_offsets(
          result_dict['instance_centers_offset'],
          target_size=self._config_dict['output_size'])

    processed_dict = {}
    (processed_dict['panoptic_outputs'],
     processed_dict['instance_centers'],
     processed_dict['instance_scores'],
     _) = self._post_processor(
         tf.nn.softmax(segmentation_outputs, axis=-1),
         instance_centers_heatmap,
         instance_centers_offset)

    label_divisor = self._config_dict['label_divisor']
    processed_dict['category_mask'] = (
        processed_dict['panoptic_outputs'] // label_divisor)
    processed_dict['instance_mask'] = (
        processed_dict['panoptic_outputs'] % label_divisor)
    processed_dict.update({
        'segmentation_outputs': result_dict['segmentation_outputs']})
    return processed_dict

  def get_config(self):
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    return cls(**config)
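As a small illustration of the label encoding used above (not part of the commit): each panoptic ID packs `category * label_divisor + instance`, so the two output masks are recovered with integer division and modulo, exactly as in `PostProcessor.call`.

import tensorflow as tf

label_divisor = 256
# A toy panoptic map: class 2 / instance 1, class 2 / instance 2, stuff class 7.
panoptic = tf.constant([[2 * label_divisor + 1, 2 * label_divisor + 2],
                        [7 * label_divisor + 0, 7 * label_divisor + 0]])

category_mask = panoptic // label_divisor  # -> [[2, 2], [7, 7]]
instance_mask = panoptic % label_divisor   # -> [[1, 2], [0, 0]]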
official/vision/beta/projects/panoptic_maskrcnn/modeling/layers/panoptic_deeplab_merge_test.py  0 → 100644

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Test for panoptic_deeplab_merge.py.

Note that the tests are branched from
https://raw.githubusercontent.com/google-research/deeplab2/main/model/post_processor/panoptic_deeplab_test.py
"""

import numpy as np
import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.modeling.layers import panoptic_deeplab_merge


class PostProcessingTest(tf.test.TestCase):

  def test_py_func_merge_semantic_and_instance_maps_can_run(self):
    batch = 1
    height = 5
    width = 5
    semantic_prediction = tf.random.uniform((batch, height, width),
                                            minval=0,
                                            maxval=20,
                                            dtype=tf.int32)
    instance_maps = tf.random.uniform((batch, height, width),
                                      minval=0,
                                      maxval=3,
                                      dtype=tf.int32)
    thing_class_ids = tf.convert_to_tensor([1, 2, 3])
    label_divisor = 256
    stuff_area_limit = 3
    void_label = 255
    panoptic_prediction = panoptic_deeplab_merge._merge_semantic_and_instance_maps(
        semantic_prediction, instance_maps, thing_class_ids, label_divisor,
        stuff_area_limit, void_label)
    self.assertListEqual(semantic_prediction.get_shape().as_list(),
                         panoptic_prediction.get_shape().as_list())

  def test_merge_semantic_and_instance_maps_with_a_simple_example(self):
    semantic_prediction = tf.convert_to_tensor(
        [[[0, 0, 0, 0],
          [0, 1, 1, 0],
          [0, 2, 2, 0],
          [2, 2, 3, 3]]], dtype=tf.int32)
    instance_maps = tf.convert_to_tensor(
        [[[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 1, 1, 0],
          [2, 2, 3, 3]]], dtype=tf.int32)
    thing_class_ids = tf.convert_to_tensor([2, 3])
    label_divisor = 256
    stuff_area_limit = 3
    void_label = 255
    # The expected_panoptic_prediction is computed as follows.
    # For `thing` segmentation, instance 1, 2, and 3 are kept, but instance 3
    # will have a new instance ID 1, since it is the first instance in its
    # own semantic label.
    # For `stuff` segmentation, class-0 region is kept, while class-1 region
    # is re-labeled as `void_label * label_divisor` since its area is smaller
    # than stuff_area_limit.
    expected_panoptic_prediction = tf.convert_to_tensor(
        [[[0, 0, 0, 0],
          [0, void_label * label_divisor, void_label * label_divisor, 0],
          [0, 2 * label_divisor + 1, 2 * label_divisor + 1, 0],
          [2 * label_divisor + 2, 2 * label_divisor + 2,
           3 * label_divisor + 1, 3 * label_divisor + 1]]],
        dtype=tf.int32)
    panoptic_prediction = panoptic_deeplab_merge._merge_semantic_and_instance_maps(
        semantic_prediction, instance_maps, thing_class_ids, label_divisor,
        stuff_area_limit, void_label)
    self.assertAllClose(expected_panoptic_prediction, panoptic_prediction)

  def test_gets_panoptic_predictions_with_score(self):
    batch = 1
    height = 5
    width = 5
    classes = 3

    semantic_logits = tf.random.uniform((batch, 1, 1, classes))
    semantic_logits = tf.tile(semantic_logits, (1, height, width, 1))

    center_heatmap = tf.convert_to_tensor([
        [1.0, 0.0, 0.0, 0.0, 0.0],
        [0.8, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.1, 0.7],
        [0.0, 0.0, 0.0, 0.0, 0.2],
    ], dtype=tf.float32)
    center_heatmap = tf.expand_dims(center_heatmap, 0)
    center_heatmap = tf.expand_dims(center_heatmap, 3)

    center_offsets = tf.zeros((batch, height, width, 2))
    center_threshold = 0.0
    thing_class_ids = tf.range(classes)  # No "stuff" classes.
    label_divisor = 256
    stuff_area_limit = 16
    void_label = classes
    nms_kernel_size = 3
    keep_k_centers = 2

    result = panoptic_deeplab_merge._get_panoptic_predictions(
        semantic_logits, center_heatmap, center_offsets, center_threshold,
        thing_class_ids, label_divisor, stuff_area_limit, void_label,
        nms_kernel_size, keep_k_centers)
    instance_maps = result[3].numpy()
    instance_scores = result[2].numpy()

    self.assertSequenceEqual(instance_maps.shape, (batch, height, width))
    expected_instances = [[
        [1, 1, 1, 1, 2],
        [1, 1, 1, 2, 2],
        [1, 1, 2, 2, 2],
        [1, 2, 2, 2, 2],
        [1, 2, 2, 2, 2],
    ]]
    np.testing.assert_array_equal(instance_maps, expected_instances)

    self.assertSequenceEqual(instance_scores.shape, (batch, height, width))
    expected_instance_scores = [[
        [1.0, 1.0, 1.0, 1.0, 0.7],
        [1.0, 1.0, 1.0, 0.7, 0.7],
        [1.0, 1.0, 0.7, 0.7, 0.7],
        [1.0, 0.7, 0.7, 0.7, 0.7],
        [1.0, 0.7, 0.7, 0.7, 0.7],
    ]]
    self.assertAllClose(result[2], tf.constant(expected_instance_scores))


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_deeplab_model.py  0 → 100644

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Build Panoptic Deeplab model."""

from typing import Any, Mapping, Optional, Union

import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.modeling.layers import panoptic_deeplab_merge


@tf.keras.utils.register_keras_serializable(package='Vision')
class PanopticDeeplabModel(tf.keras.Model):
  """Panoptic Deeplab model."""

  def __init__(
      self,
      backbone: tf.keras.Model,
      semantic_decoder: tf.keras.Model,
      semantic_head: tf.keras.layers.Layer,
      instance_head: tf.keras.layers.Layer,
      instance_decoder: Optional[tf.keras.Model] = None,
      post_processor: Optional[panoptic_deeplab_merge.PostProcessor] = None,
      **kwargs):
    """Panoptic deeplab model initializer.

    Args:
      backbone: a backbone network.
      semantic_decoder: a decoder network. E.g. FPN.
      semantic_head: segmentation head.
      instance_head: instance center head.
      instance_decoder: Optional decoder network for instance predictions.
      post_processor: Optional post processor layer.
      **kwargs: keyword arguments to be passed.
    """
    super(PanopticDeeplabModel, self).__init__(**kwargs)

    self._config_dict = {
        'backbone': backbone,
        'semantic_decoder': semantic_decoder,
        'instance_decoder': instance_decoder,
        'semantic_head': semantic_head,
        'instance_head': instance_head,
        'post_processor': post_processor
    }
    self.backbone = backbone
    self.semantic_decoder = semantic_decoder
    self.instance_decoder = instance_decoder
    self.semantic_head = semantic_head
    self.instance_head = instance_head
    self.post_processor = post_processor

  def call(self, inputs: tf.Tensor, image_info: tf.Tensor,
           training: bool = None):
    if training is None:
      training = tf.keras.backend.learning_phase()

    backbone_features = self.backbone(inputs, training=training)

    semantic_features = self.semantic_decoder(
        backbone_features, training=training)
    if self.instance_decoder is None:
      instance_features = semantic_features
    else:
      instance_features = self.instance_decoder(
          backbone_features, training=training)

    segmentation_outputs = self.semantic_head(
        (backbone_features, semantic_features), training=training)
    instance_outputs = self.instance_head(
        (backbone_features, instance_features), training=training)

    outputs = {
        'segmentation_outputs': segmentation_outputs,
        'instance_centers_heatmap':
            instance_outputs['instance_centers_heatmap'],
        'instance_centers_offset':
            instance_outputs['instance_centers_offset'],
    }
    if training:
      return outputs

    if self.post_processor is not None:
      panoptic_masks = self.post_processor(outputs, image_info)
      outputs.update(panoptic_masks)
    return outputs

  @property
  def checkpoint_items(
      self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
    """Returns a dictionary of items to be additionally checkpointed."""
    items = dict(
        backbone=self.backbone,
        semantic_decoder=self.semantic_decoder,
        semantic_head=self.semantic_head,
        instance_head=self.instance_head)
    if self.instance_decoder is not None:
      items.update(instance_decoder=self.instance_decoder)

    return items

  def get_config(self) -> Mapping[str, Any]:
    return self._config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
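The `checkpoint_items` property above is what the task's `initialize()` (further down in this commit) feeds to `tf.train.Checkpoint` for partial restores. A minimal sketch, assuming `model` is an already-built `PanopticDeeplabModel` and the checkpoint path is a placeholder:

import tensorflow as tf

# `model` is assumed to be a built PanopticDeeplabModel instance.
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.read('/path/to/pretrained/checkpoint')  # placeholder path
status.expect_partial().assert_existing_objects_matched()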
official/vision/beta/projects/panoptic_maskrcnn/modeling/panoptic_deeplab_model_test.py  0 → 100644

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for Panoptic Deeplab network."""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.distribute import combinations
from official.vision.beta.projects.panoptic_maskrcnn.modeling import panoptic_deeplab_model
from official.vision.beta.projects.panoptic_maskrcnn.modeling.heads import panoptic_deeplab_heads
from official.vision.beta.projects.panoptic_maskrcnn.modeling.layers import panoptic_deeplab_merge
from official.vision.modeling import backbones
from official.vision.modeling.decoders import aspp


class PanopticDeeplabNetworkTest(parameterized.TestCase, tf.test.TestCase):

  @combinations.generate(
      combinations.combine(
          level=[2, 3, 4],
          input_size=[256, 512],
          low_level=[[4, 3], [3, 2]],
          shared_decoder=[True, False],
          training=[True, False]))
  def test_panoptic_deeplab_network_creation(
      self, input_size, level, low_level, shared_decoder, training):
    """Test for creation of a panoptic deeplab network."""
    batch_size = 2 if training else 1
    num_classes = 10
    inputs = np.random.rand(batch_size, input_size, input_size, 3)
    image_info = tf.convert_to_tensor(
        [[[input_size, input_size], [input_size, input_size], [1, 1], [0, 0]]])
    image_info = tf.tile(image_info, [batch_size, 1, 1])

    tf.keras.backend.set_image_data_format('channels_last')
    backbone = backbones.ResNet(model_id=50)

    semantic_decoder = aspp.ASPP(level=level, dilation_rates=[6, 12, 18])

    if shared_decoder:
      instance_decoder = semantic_decoder
    else:
      instance_decoder = aspp.ASPP(level=level, dilation_rates=[6, 12, 18])

    semantic_head = panoptic_deeplab_heads.SemanticHead(
        num_classes,
        level=level,
        low_level=low_level,
        low_level_num_filters=(64, 32))

    instance_head = panoptic_deeplab_heads.InstanceHead(
        level=level,
        low_level=low_level,
        low_level_num_filters=(64, 32))

    post_processor = panoptic_deeplab_merge.PostProcessor(
        output_size=[input_size, input_size],
        center_score_threshold=0.1,
        thing_class_ids=[1, 2, 3, 4],
        label_divisor=[256],
        stuff_area_limit=4096,
        ignore_label=0,
        nms_kernel=41,
        keep_k_centers=41,
        rescale_predictions=True)

    model = panoptic_deeplab_model.PanopticDeeplabModel(
        backbone=backbone,
        semantic_decoder=semantic_decoder,
        instance_decoder=instance_decoder,
        semantic_head=semantic_head,
        instance_head=instance_head,
        post_processor=post_processor)

    outputs = model(
        inputs=inputs,
        image_info=image_info,
        training=training)

    if training:
      self.assertIn('segmentation_outputs', outputs)
      self.assertIn('instance_centers_heatmap', outputs)
      self.assertIn('instance_centers_offset', outputs)

      self.assertAllEqual(
          [2, input_size // (2**low_level[-1]),
           input_size // (2**low_level[-1]),
           num_classes],
          outputs['segmentation_outputs'].numpy().shape)
      self.assertAllEqual(
          [2, input_size // (2**low_level[-1]),
           input_size // (2**low_level[-1]),
           1],
          outputs['instance_centers_heatmap'].numpy().shape)
      self.assertAllEqual(
          [2, input_size // (2**low_level[-1]),
           input_size // (2**low_level[-1]),
           2],
          outputs['instance_centers_offset'].numpy().shape)
    else:
      self.assertIn('panoptic_outputs', outputs)
      self.assertIn('category_mask', outputs)
      self.assertIn('instance_mask', outputs)
      self.assertIn('instance_centers', outputs)
      self.assertIn('instance_scores', outputs)
      self.assertIn('segmentation_outputs', outputs)

  @combinations.generate(
      combinations.combine(
          level=[2, 3, 4],
          low_level=[(4, 3), (3, 2)],
          shared_decoder=[True, False]))
  def test_serialize_deserialize(self, level, low_level, shared_decoder):
    """Validate the network can be serialized and deserialized."""
    num_classes = 10
    backbone = backbones.ResNet(model_id=50)

    semantic_decoder = aspp.ASPP(level=level, dilation_rates=[6, 12, 18])

    if shared_decoder:
      instance_decoder = semantic_decoder
    else:
      instance_decoder = aspp.ASPP(level=level, dilation_rates=[6, 12, 18])

    semantic_head = panoptic_deeplab_heads.SemanticHead(
        num_classes,
        level=level,
        low_level=low_level,
        low_level_num_filters=(64, 32))

    instance_head = panoptic_deeplab_heads.InstanceHead(
        level=level,
        low_level=low_level,
        low_level_num_filters=(64, 32))

    post_processor = panoptic_deeplab_merge.PostProcessor(
        output_size=[640, 640],
        center_score_threshold=0.1,
        thing_class_ids=[1, 2, 3, 4],
        label_divisor=[256],
        stuff_area_limit=4096,
        ignore_label=0,
        nms_kernel=41,
        keep_k_centers=41,
        rescale_predictions=True)

    model = panoptic_deeplab_model.PanopticDeeplabModel(
        backbone=backbone,
        semantic_decoder=semantic_decoder,
        instance_decoder=instance_decoder,
        semantic_head=semantic_head,
        instance_head=instance_head,
        post_processor=post_processor)

    config = model.get_config()
    new_model = panoptic_deeplab_model.PanopticDeeplabModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/panoptic_maskrcnn/ops/mask_ops.py  0 → 100644

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utility functions for masks."""

import tensorflow as tf


def resize_and_rescale_offsets(input_tensor: tf.Tensor, target_size):
  """Bilinearly resizes and rescales the offsets.

  Reference:
  https://github.com/google-research/deeplab2/blob/main/model/utils.py#L157

  Args:
    input_tensor: A tf.Tensor of shape [batch, height, width, 2].
    target_size: A list or tuple or 1D tf.Tensor that specifies the height and
      width after resizing.

  Returns:
    The input_tensor resized to shape `[batch, target_height, target_width, 2]`.
      Moreover, the offsets along the y-axis are rescaled by a factor equal to
      (target_height - 1) / (reference_height - 1) and the offsets along the
      x-axis are rescaled by a factor equal to
      (target_width - 1) / (reference_width - 1).
  """
  input_size_y = tf.shape(input_tensor)[1]
  input_size_x = tf.shape(input_tensor)[2]

  dtype = input_tensor.dtype
  scale_y = tf.cast(target_size[0] - 1, dtype=dtype) / tf.cast(
      input_size_y - 1, dtype=dtype)
  scale_x = tf.cast(target_size[1] - 1, dtype=dtype) / tf.cast(
      input_size_x - 1, dtype=dtype)

  target_y, target_x = tf.split(
      value=input_tensor, num_or_size_splits=2, axis=3)
  target_y *= scale_y
  target_x *= scale_x
  input_tensor = tf.concat([target_y, target_x], 3)
  return tf.image.resize(
      input_tensor,
      size=target_size,
      method=tf.image.ResizeMethod.BILINEAR)
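A minimal usage sketch for the op above (toy values only, assuming the module path from this commit). Resizing a 2x2 offset field to 4x4 scales both offset channels by (4 - 1) / (2 - 1) = 3 before the bilinear resize:

import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.ops import mask_ops

# A [1, 2, 2, 2] offset field with one unit y-offset and one unit x-offset.
offsets = tf.constant([[[[1.0, 0.0], [0.0, 1.0]],
                        [[0.0, 0.0], [0.0, 0.0]]]])
resized = mask_ops.resize_and_rescale_offsets(offsets, target_size=[4, 4])
print(resized.shape)  # (1, 4, 4, 2); offset values are rescaled by 3.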
official/vision/beta/projects/panoptic_maskrcnn/tasks/panoptic_deeplab.py
0 → 100644
View file @
3e3b0c64
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Panoptic Deeplab task definition."""
from typing import Any, Dict, List, Mapping, Optional, Tuple

from absl import logging
import tensorflow as tf

from official.common import dataset_fn
from official.core import base_task
from official.core import task_factory
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as exp_cfg
from official.vision.beta.projects.panoptic_maskrcnn.dataloaders import panoptic_deeplab_input
from official.vision.beta.projects.panoptic_maskrcnn.losses import panoptic_deeplab_losses
from official.vision.beta.projects.panoptic_maskrcnn.modeling import factory
from official.vision.dataloaders import input_reader_factory
from official.vision.evaluation import panoptic_quality_evaluator
from official.vision.evaluation import segmentation_metrics


@task_factory.register_task_cls(exp_cfg.PanopticDeeplabTask)
class PanopticDeeplabTask(base_task.Task):
  """A task for Panoptic Deeplab."""

  def build_model(self):
    """Builds panoptic deeplab model."""
    input_specs = tf.keras.layers.InputSpec(
        shape=[None] + self.task_config.model.input_size)

    l2_weight_decay = self.task_config.losses.l2_weight_decay
    # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss.
    # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2)
    # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss)
    l2_regularizer = (tf.keras.regularizers.l2(
        l2_weight_decay / 2.0) if l2_weight_decay else None)

    model = factory.build_panoptic_deeplab(
        input_specs=input_specs,
        model_config=self.task_config.model,
        l2_regularizer=l2_regularizer)
    return model

  def initialize(self, model: tf.keras.Model):
    """Loads pretrained checkpoint."""
    if not self.task_config.init_checkpoint:
      return

    ckpt_dir_or_file = self.task_config.init_checkpoint
    if tf.io.gfile.isdir(ckpt_dir_or_file):
      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)

    # Restoring checkpoint.
    if 'all' in self.task_config.init_checkpoint_modules:
      ckpt = tf.train.Checkpoint(**model.checkpoint_items)
      status = ckpt.read(ckpt_dir_or_file)
      status.expect_partial().assert_existing_objects_matched()
    else:
      ckpt_items = {}
      if 'backbone' in self.task_config.init_checkpoint_modules:
        ckpt_items.update(backbone=model.backbone)
      if 'decoder' in self.task_config.init_checkpoint_modules:
        ckpt_items.update(semantic_decoder=model.semantic_decoder)
        if not self.task_config.model.shared_decoder:
          ckpt_items.update(instance_decoder=model.instance_decoder)

      ckpt = tf.train.Checkpoint(**ckpt_items)
      status = ckpt.read(ckpt_dir_or_file)
      status.expect_partial().assert_existing_objects_matched()

    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)

  def build_inputs(self,
                   params: exp_cfg.DataConfig,
                   input_context: Optional[tf.distribute.InputContext] = None):
    """Builds panoptic deeplab input."""

    decoder_cfg = params.decoder.get()
    if params.decoder.type == 'simple_decoder':
      decoder = panoptic_deeplab_input.TfExampleDecoder(
          regenerate_source_id=decoder_cfg.regenerate_source_id,
          panoptic_category_mask_key=decoder_cfg.panoptic_category_mask_key,
          panoptic_instance_mask_key=decoder_cfg.panoptic_instance_mask_key)
    else:
      raise ValueError('Unknown decoder type: {}!'.format(params.decoder.type))

    parser = panoptic_deeplab_input.Parser(
        output_size=self.task_config.model.input_size[:2],
        ignore_label=params.parser.ignore_label,
        resize_eval_groundtruth=params.parser.resize_eval_groundtruth,
        groundtruth_padded_size=params.parser.groundtruth_padded_size,
        aug_scale_min=params.parser.aug_scale_min,
        aug_scale_max=params.parser.aug_scale_max,
        aug_rand_hflip=params.parser.aug_rand_hflip,
        aug_type=params.parser.aug_type,
        sigma=params.parser.sigma,
        dtype=params.parser.dtype)

    reader = input_reader_factory.input_reader_generator(
        params,
        dataset_fn=dataset_fn.pick_dataset_fn(params.file_type),
        decoder_fn=decoder.decode,
        parser_fn=parser.parse_fn(params.is_training))

    dataset = reader.read(input_context=input_context)
    return dataset

  def build_losses(self,
                   labels: Mapping[str, tf.Tensor],
                   model_outputs: Mapping[str, tf.Tensor],
                   aux_losses: Optional[Any] = None):
    """Panoptic deeplab losses.

    Args:
      labels: labels.
      model_outputs: Output logits from panoptic deeplab.
      aux_losses: auxiliary loss tensors, i.e. `losses` in keras.Model.

    Returns:
      The total loss tensor.
    """
    loss_config = self._task_config.losses
    segmentation_loss_fn = panoptic_deeplab_losses.WeightedBootstrappedCrossEntropyLoss(
        loss_config.label_smoothing,
        loss_config.class_weights,
        loss_config.ignore_label,
        top_k_percent_pixels=loss_config.top_k_percent_pixels)
    instance_center_heatmap_loss_fn = panoptic_deeplab_losses.CenterHeatmapLoss()
    instance_center_offset_loss_fn = panoptic_deeplab_losses.CenterOffsetLoss()

    semantic_weights = tf.cast(
        labels['semantic_weights'],
        dtype=model_outputs['instance_centers_heatmap'].dtype)
    things_mask = tf.cast(
        tf.squeeze(labels['things_mask'], axis=3),
        dtype=model_outputs['instance_centers_heatmap'].dtype)
    valid_mask = tf.cast(
        tf.squeeze(labels['valid_mask'], axis=3),
        dtype=model_outputs['instance_centers_heatmap'].dtype)

    segmentation_loss = segmentation_loss_fn(
        model_outputs['segmentation_outputs'],
        labels['category_mask'],
        sample_weight=semantic_weights)
    instance_center_heatmap_loss = instance_center_heatmap_loss_fn(
        model_outputs['instance_centers_heatmap'],
        labels['instance_centers_heatmap'],
        sample_weight=valid_mask)
    instance_center_offset_loss = instance_center_offset_loss_fn(
        model_outputs['instance_centers_offset'],
        labels['instance_centers_offset'],
        sample_weight=things_mask)

    model_loss = (
        loss_config.segmentation_loss_weight * segmentation_loss +
        loss_config.center_heatmap_loss_weight * instance_center_heatmap_loss +
        loss_config.center_offset_loss_weight * instance_center_offset_loss)

    total_loss = model_loss
    if aux_losses:
      total_loss += tf.add_n(aux_losses)

    losses = {
        'total_loss': total_loss,
        'model_loss': model_loss,
        'segmentation_loss': segmentation_loss,
        'instance_center_heatmap_loss': instance_center_heatmap_loss,
        'instance_center_offset_loss': instance_center_offset_loss
    }
    return losses

  def build_metrics(self,
                    training: bool = True) -> List[tf.keras.metrics.Metric]:
    """Build metrics."""
    eval_config = self.task_config.evaluation
    metrics = []
    if training:
      metric_names = [
          'total_loss',
          'segmentation_loss',
          'instance_center_heatmap_loss',
          'instance_center_offset_loss',
          'model_loss']
      for name in metric_names:
        metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32))

      if eval_config.report_train_mean_iou:
        self.train_mean_iou = segmentation_metrics.MeanIoU(
            name='train_mean_iou',
            num_classes=self.task_config.model.num_classes,
            rescale_predictions=False,
            dtype=tf.float32)
    else:
      rescale_predictions = (
          not self.task_config.validation_data.parser.resize_eval_groundtruth)
      self.perclass_iou_metric = segmentation_metrics.PerClassIoU(
          name='per_class_iou',
          num_classes=self.task_config.model.num_classes,
          rescale_predictions=rescale_predictions,
          dtype=tf.float32)

      if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy):
        self._process_iou_metric_on_cpu = True
      else:
        self._process_iou_metric_on_cpu = False

      if self.task_config.model.generate_panoptic_masks:
        self.panoptic_quality_metric = (
            panoptic_quality_evaluator.PanopticQualityEvaluator(
                num_categories=self.task_config.model.num_classes,
                ignored_label=eval_config.ignored_label,
                max_instances_per_category=eval_config.max_instances_per_category,
                offset=eval_config.offset,
                is_thing=eval_config.is_thing,
                rescale_predictions=eval_config.rescale_predictions))

      # Update state on CPU if TPUStrategy due to dynamic resizing.
      self._process_iou_metric_on_cpu = isinstance(
          tf.distribute.get_strategy(), tf.distribute.TPUStrategy)

    return metrics

  def train_step(self,
                 inputs: Tuple[Any, Any],
                 model: tf.keras.Model,
                 optimizer: tf.keras.optimizers.Optimizer,
                 metrics: Optional[List[Any]] = None) -> Dict[str, Any]:
    """Does forward and backward.

    Args:
      inputs: a dictionary of input tensors.
      model: the model, forward pass definition.
      optimizer: the optimizer for this training step.
      metrics: a nested structure of metrics objects.

    Returns:
      A dictionary of logs.
    """
    images, labels = inputs
    num_replicas = tf.distribute.get_strategy().num_replicas_in_sync

    with tf.GradientTape() as tape:
      outputs = model(
          inputs=images,
          image_info=labels['image_info'],
          training=True)
      outputs = tf.nest.map_structure(
          lambda x: tf.cast(x, tf.float32), outputs)

      # Computes per-replica loss.
      losses = self.build_losses(
          labels=labels,
          model_outputs=outputs,
          aux_losses=model.losses)
      scaled_loss = losses['total_loss'] / num_replicas

      # For mixed_precision policy, when LossScaleOptimizer is used, loss is
      # scaled for numerical stability.
      if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
        scaled_loss = optimizer.get_scaled_loss(scaled_loss)

    tvars = model.trainable_variables
    grads = tape.gradient(scaled_loss, tvars)
    # Scales back gradient when LossScaleOptimizer is used.
    if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
      grads = optimizer.get_unscaled_gradients(grads)
    optimizer.apply_gradients(list(zip(grads, tvars)))

    logs = {self.loss: losses['total_loss']}

    if metrics:
      for m in metrics:
        m.update_state(losses[m.name])

    if self.task_config.evaluation.report_train_mean_iou:
      segmentation_labels = {
          'masks': labels['category_mask'],
          'valid_masks': labels['valid_mask'],
          'image_info': labels['image_info']
      }
      self.process_metrics(
          metrics=[self.train_mean_iou],
          labels=segmentation_labels,
          model_outputs=outputs['segmentation_outputs'])
      logs.update({
          self.train_mean_iou.name: self.train_mean_iou.result()
      })

    return logs

  def validation_step(self,
                      inputs: Tuple[Any, Any],
                      model: tf.keras.Model,
                      metrics: Optional[List[Any]] = None) -> Dict[str, Any]:
    """Validation step.

    Args:
      inputs: a dictionary of input tensors.
      model: the keras.Model.
      metrics: a nested structure of metrics objects.

    Returns:
      A dictionary of logs.
    """
    images, labels = inputs

    outputs = model(
        inputs=images,
        image_info=labels['image_info'],
        training=False)

    logs = {self.loss: 0}

    segmentation_labels = {
        'masks': labels['category_mask'],
        'valid_masks': labels['valid_mask'],
        'image_info': labels['image_info']
    }

    if self._process_iou_metric_on_cpu:
      logs.update({
          self.perclass_iou_metric.name:
              (segmentation_labels, outputs['segmentation_outputs'])
      })
    else:
      self.perclass_iou_metric.update_state(
          segmentation_labels, outputs['segmentation_outputs'])

    if self.task_config.model.generate_panoptic_masks:
      pq_metric_labels = {
          'category_mask': tf.squeeze(labels['category_mask'], axis=3),
          'instance_mask': tf.squeeze(labels['instance_mask'], axis=3),
          'image_info': labels['image_info']
      }
      panoptic_outputs = {
          'category_mask': outputs['category_mask'],
          'instance_mask': outputs['instance_mask'],
      }
      logs.update({
          self.panoptic_quality_metric.name:
              (pq_metric_labels, panoptic_outputs)
      })
    return logs

  def aggregate_logs(self, state=None, step_outputs=None):
    if state is None:
      self.perclass_iou_metric.reset_states()
      state = [self.perclass_iou_metric]
      if self.task_config.model.generate_panoptic_masks:
        state += [self.panoptic_quality_metric]

    if self._process_iou_metric_on_cpu:
      self.perclass_iou_metric.update_state(
          step_outputs[self.perclass_iou_metric.name][0],
          step_outputs[self.perclass_iou_metric.name][1])

    if self.task_config.model.generate_panoptic_masks:
      self.panoptic_quality_metric.update_state(
          step_outputs[self.panoptic_quality_metric.name][0],
          step_outputs[self.panoptic_quality_metric.name][1])

    return state

  def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
    result = {}
    ious = self.perclass_iou_metric.result()
    if self.task_config.evaluation.report_per_class_iou:
      for i, value in enumerate(ious.numpy()):
        result.update({'segmentation_iou/class_{}'.format(i): value})

    # Computes mean IoU
    result.update({'segmentation_mean_iou': tf.reduce_mean(ious).numpy()})

    if self.task_config.model.generate_panoptic_masks:
      panoptic_quality_results = self.panoptic_quality_metric.result()
      for k, value in panoptic_quality_results.items():
        if k.endswith('per_class'):
          if self.task_config.evaluation.report_per_class_pq:
            for i, per_class_value in enumerate(value):
              metric_key = 'panoptic_quality/{}/class_{}'.format(k, i)
              result[metric_key] = per_class_value
          else:
            continue
        else:
          result['panoptic_quality/{}'.format(k)] = value
    return result
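The initialize() method above restores only the modules named in init_checkpoint_modules. The standalone sketch below (not part of the commit) illustrates that partial-restore pattern with two small Keras models standing in for model.backbone and model.semantic_decoder; the module names and the toy models are hypothetical, while the tf.train.Checkpoint calls mirror the ones used in the task.

# Sketch of the partial-restore pattern from PanopticDeeplabTask.initialize().
import os
import tempfile
import tensorflow as tf

# Hypothetical stand-ins for model.backbone and model.semantic_decoder.
backbone = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(4,))])
decoder = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(4,))])

# Write a "pretrained" checkpoint containing both modules.
ckpt_prefix = os.path.join(tempfile.mkdtemp(), 'ckpt')
saved_path = tf.train.Checkpoint(
    backbone=backbone, semantic_decoder=decoder).write(ckpt_prefix)

# Restore only the backbone, mirroring init_checkpoint_modules=['backbone'].
ckpt_items = {'backbone': backbone}
status = tf.train.Checkpoint(**ckpt_items).read(saved_path)
# expect_partial() silences warnings about objects in the file that were not
# requested; assert_existing_objects_matched() still verifies the requested
# module was actually found and restored.
status.expect_partial().assert_existing_objects_matched()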
official/vision/beta/projects/panoptic_maskrcnn/tasks/panoptic_deeplab_test.py
0 → 100644
View file @
3e3b0c64
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic_deeplab.py."""
import os

from absl.testing import parameterized
import tensorflow as tf

from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as cfg
from official.vision.beta.projects.panoptic_maskrcnn.tasks import panoptic_deeplab


# TODO(b/234636381): add unit test for train and validation step
class PanopticDeeplabTaskTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      (['all'], False),
      (['backbone'], False),
      (['decoder'], False),
      (['decoder'], True))
  def test_model_initializing(self, init_checkpoint_modules, shared_decoder):
    task_config = cfg.PanopticDeeplabTask(
        model=cfg.PanopticDeeplab(
            num_classes=10,
            input_size=[640, 640, 3],
            shared_decoder=shared_decoder))
    task = panoptic_deeplab.PanopticDeeplabTask(task_config)
    model = task.build_model()

    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
    ckpt_save_dir = self.create_tempdir().full_path
    ckpt.save(os.path.join(ckpt_save_dir, 'ckpt'))
    task._task_config.init_checkpoint = ckpt_save_dir
    task._task_config.init_checkpoint_modules = init_checkpoint_modules
    task.initialize(model)

  @parameterized.parameters((True,), (False,))
  def test_build_metrics(self, training):
    task_config = cfg.PanopticDeeplabTask(
        model=cfg.PanopticDeeplab(
            num_classes=10,
            input_size=[640, 640, 3],
            shared_decoder=False))
    task = panoptic_deeplab.PanopticDeeplabTask(task_config)
    metrics = task.build_metrics(training=training)

    if training:
      expected_metric_names = {
          'total_loss',
          'segmentation_loss',
          'instance_center_heatmap_loss',
          'instance_center_offset_loss',
          'model_loss'
      }
      self.assertEqual(expected_metric_names,
                       set([metric.name for metric in metrics]))
    else:
      assert hasattr(task, 'perclass_iou_metric')
      assert hasattr(task, 'panoptic_quality_metric')


if __name__ == '__main__':
  tf.test.main()
official/vision/beta/projects/panoptic_maskrcnn/train.py
View file @
3e3b0c64
@@ -18,9 +18,12 @@ from absl import app
 from official.common import flags as tfm_flags
 from official.vision import train
-from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as cfg  # pylint: disable=unused-import
-from official.vision.beta.projects.panoptic_maskrcnn.tasks import panoptic_maskrcnn as task  # pylint: disable=unused-import
+# pylint: disable=unused-import
+from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab
+from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn
+from official.vision.beta.projects.panoptic_maskrcnn.tasks import panoptic_deeplab as panoptic_deeplab_task
+from official.vision.beta.projects.panoptic_maskrcnn.tasks import panoptic_maskrcnn as panoptic_maskrcnn_task
+# pylint: enable=unused-import

 if __name__ == '__main__':
   tfm_flags.define_flags()
...
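The seemingly unused imports added to train.py are what make the new experiment visible to the trainer: importing the tasks module executes the @task_factory.register_task_cls decorator, and importing the configs module registers the experiment configurations. The sketch below (not from the commit) illustrates that effect; it assumes task_factory.get_task is available from official.core as in other Model Garden trainers, and the config values are illustrative only.

# Sketch: registration happens as a side effect of the imports.
from official.core import task_factory
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as exp_cfg
# Importing the tasks module registers PanopticDeeplabTask for its config class.
from official.vision.beta.projects.panoptic_maskrcnn.tasks import panoptic_deeplab  # pylint: disable=unused-import

# Illustrative config values; real runs take experiment configs via train.py flags.
task_config = exp_cfg.PanopticDeeplabTask(
    model=exp_cfg.PanopticDeeplab(num_classes=10, input_size=[640, 640, 3]))
task = task_factory.get_task(task_config)  # resolves to PanopticDeeplabTask
model = task.build_model()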
official/vision/ops/augment.py
View file @
3e3b0c64
@@ -1583,6 +1583,7 @@ class AutoAugment(ImageAugment):
         'reduced_cifar10': self.policy_reduced_cifar10(),
         'svhn': self.policy_svhn(),
         'reduced_imagenet': self.policy_reduced_imagenet(),
+        'panoptic_deeplab_policy': self.panoptic_deeplab_policy(),
     }
     if not policies:
...
@@ -1888,6 +1889,16 @@ class AutoAugment(ImageAugment):
       ]
     return policy

+  @staticmethod
+  def panoptic_deeplab_policy():
+    policy = [[('Sharpness', 0.4, 1.4), ('Brightness', 0.2, 2.0)],
+              [('Equalize', 0.0, 1.8), ('Contrast', 0.2, 2.0)],
+              [('Sharpness', 0.2, 1.8), ('Color', 0.2, 1.8)],
+              [('Solarize', 0.2, 1.4), ('Equalize', 0.6, 1.8)],
+              [('Sharpness', 0.2, 0.2), ('Equalize', 0.2, 1.4)]]
+    return policy
+
   @staticmethod
   def policy_test():
     """Autoaugment test policy for debugging."""
...
@@ -2025,7 +2036,7 @@ class RandAugment(ImageAugment):
       aug_image, aug_bboxes = tf.switch_case(
           branch_index=op_to_select,
           branch_fns=branch_fns,
-          default=lambda: (tf.identity(image), _maybe_identity(bboxes)))
+          default=lambda: (tf.identity(image), _maybe_identity(bboxes)))  # pylint: disable=cell-var-from-loop
       if self.prob_to_apply is not None:
         aug_image, aug_bboxes = tf.cond(
...
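With the dictionary entry added in the first hunk, the new policy becomes selectable by name when constructing AutoAugment. The sketch below (not part of the commit) assumes the existing AutoAugment(augmentation_name=...) constructor and distort() method in official/vision/ops/augment.py; the zero-valued image is only a placeholder input.

# Sketch: applying the policy added by this commit.
import tensorflow as tf
from official.vision.ops import augment

autoaugmenter = augment.AutoAugment(augmentation_name='panoptic_deeplab_policy')
image = tf.zeros([512, 512, 3], dtype=tf.uint8)  # placeholder uint8 image
augmented_image = autoaugmenter.distort(image)   # same shape and dtype as input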