"test/old-api/spectests.cpp" did not exist on "a5607f82a302c35b30f05141551f734284da1e5f"
Unverified commit c127d527, authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents 78657911 457bcb85
@@ -64,6 +64,72 @@ class MovinetLayersTest(parameterized.TestCase, tf.test.TestCase):
    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_mobile_conv2d_bn(self):
    batch_norm_op = tf.keras.layers.BatchNormalization(
        momentum=0.9,
        epsilon=1.,
        name='bn')
    conv2d = movinet_layers.MobileConv2D(
        filters=3,
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='same',
        kernel_initializer='ones',
        use_bias=False,
        use_depthwise=False,
        use_temporal=False,
        use_buffered_input=True,
        batch_norm_op=batch_norm_op,
    )
    inputs = tf.ones([1, 2, 2, 2, 3])

    predicted = conv2d(inputs)

    expected = tf.constant(
        [[[[[8.48528, 8.48528, 8.48528],
            [8.48528, 8.48528, 8.48528]],
           [[8.48528, 8.48528, 8.48528],
            [8.48528, 8.48528, 8.48528]]],
          [[[8.48528, 8.48528, 8.48528],
            [8.48528, 8.48528, 8.48528]],
           [[8.48528, 8.48528, 8.48528],
            [8.48528, 8.48528, 8.48528]]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)
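For reference, the 8.48528 constant: with a 3x3 ones kernel and 'same' padding, every output position of the 2x2x3 all-ones input sums to 12, and a freshly built BatchNormalization layer in inference mode (moving mean 0, moving variance 1) divides by sqrt(variance + epsilon) = sqrt(2). A quick check in plain Python:

    import math

    conv_sum = 2 * 2 * 3  # 'same' padding: every output sees all 12 ones
    moving_mean, moving_var, epsilon = 0.0, 1.0, 1.0
    print((conv_sum - moving_mean) / math.sqrt(moving_var + epsilon))  # 8.485281...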

  def test_mobile_conv2d_activation(self):
    conv2d = movinet_layers.MobileConv2D(
        filters=3,
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='same',
        kernel_initializer='ones',
        use_bias=False,
        use_depthwise=False,
        use_temporal=False,
        use_buffered_input=True,
        activation_op=tf.nn.relu6,
    )
    inputs = tf.ones([1, 2, 2, 2, 3])

    predicted = conv2d(inputs)

    expected = tf.constant(
        [[[[[6., 6., 6.],
            [6., 6., 6.]],
           [[6., 6., 6.],
            [6., 6., 6.]]],
          [[[6., 6., 6.],
            [6., 6., 6.]],
           [[6., 6., 6.],
            [6., 6., 6.]]]]])

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)
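For reference, the constant 6 here is the same per-position sum of 12 as above, clipped by relu6:

    import tensorflow as tf

    # relu6 clips activations at 6, so the summed value 12 becomes 6.
    print(tf.nn.relu6(tf.constant(12.0)).numpy())  # 6.0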

  def test_mobile_conv2d_temporal(self):
    conv2d = movinet_layers.MobileConv2D(
        filters=3,
@@ -378,6 +444,35 @@ class MovinetLayersTest(parameterized.TestCase, tf.test.TestCase):
    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected)

  def test_stream_movinet_block_none_se(self):
    block = movinet_layers.MovinetBlock(
        out_filters=3,
        expand_filters=6,
        kernel_size=(3, 3, 3),
        strides=(1, 2, 2),
        causal=True,
        se_type='none',
        state_prefix='test',
    )

    inputs = tf.range(4, dtype=tf.float32) + 1.
    inputs = tf.reshape(inputs, [1, 4, 1, 1, 1])
    inputs = tf.tile(inputs, [1, 1, 2, 1, 3])
    expected, expected_states = block(inputs)

    for num_splits in [1, 2, 4]:
      frames = tf.split(inputs, inputs.shape[1] // num_splits, axis=1)
      states = {}
      predicted = []
      for frame in frames:
        x, states = block(frame, states=states)
        predicted.append(x)
      predicted = tf.concat(predicted, axis=1)

      self.assertEqual(predicted.shape, expected.shape)
      self.assertAllClose(predicted, expected)
      self.assertAllEqual(list(expected_states.keys()), ['test_stream_buffer'])
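For context on what this test exercises: a causal temporal convolution over a whole clip produces the same activations as running it frame by frame while carrying the last kernel_size - 1 frames in a buffer, which is what the `stream_buffer` state stores. A minimal sketch with a plain 1-D convolution (illustrative only, not the library's implementation):

    import tensorflow as tf

    # Causal temporal convolution over the full clip, kernel size 3.
    kernel = tf.ones([3, 1, 1])                     # [time, in_ch, out_ch]
    clip = tf.reshape(tf.range(6, dtype=tf.float32), [1, 6, 1])
    padded = tf.pad(clip, [[0, 0], [2, 0], [0, 0]])  # pad the past only (causal)
    full = tf.nn.conv1d(padded, kernel, stride=1, padding='VALID')

    # The same result frame by frame, carrying the last kernel_size - 1 frames.
    stream_buffer = tf.zeros([1, 2, 1])
    outputs = []
    for t in range(6):
      frame = clip[:, t:t + 1, :]
      window = tf.concat([stream_buffer, frame], axis=1)  # full receptive field
      outputs.append(tf.nn.conv1d(window, kernel, stride=1, padding='VALID'))
      stream_buffer = window[:, 1:, :]                    # slide the buffer
    streamed = tf.concat(outputs, axis=1)

    tf.debugging.assert_near(full, streamed)  # identical activations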

  def test_stream_classifier_head(self):
    head = movinet_layers.Head(project_filters=5)
    classifier_head = movinet_layers.ClassifierHead(
...
@@ -99,6 +99,49 @@ class MoViNetTest(parameterized.TestCase, tf.test.TestCase):
    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected, 1e-5, 1e-5)

  def test_movinet_stream_nse(self):
    """Tests that the backbone runs in streaming mode without SE layers."""
    tf.keras.backend.set_image_data_format('channels_last')

    backbone = movinet.Movinet(
        model_id='a0',
        causal=True,
        use_external_states=True,
        se_type='none',
    )
    inputs = tf.ones([1, 5, 128, 128, 3])

    init_states = backbone.init_states(tf.shape(inputs))
    expected_endpoints, _ = backbone({**init_states, 'image': inputs})

    frames = tf.split(inputs, inputs.shape[1], axis=1)

    states = init_states
    for frame in frames:
      output, states = backbone({**states, 'image': frame})
    predicted_endpoints = output

    predicted = predicted_endpoints['head']

    # The expected final output is simply the mean across frames.
    expected = expected_endpoints['head']
    expected = tf.reduce_mean(expected, 1, keepdims=True)

    self.assertEqual(predicted.shape, expected.shape)
    self.assertAllClose(predicted, expected, 1e-5, 1e-5)

    # Check the contents of the states dictionary.
    state_keys = list(init_states.keys())
    self.assertIn('state_head_pool_buffer', state_keys)
    self.assertIn('state_head_pool_frame_count', state_keys)
    state_keys.remove('state_head_pool_buffer')
    state_keys.remove('state_head_pool_frame_count')
    # The remaining states are the 'stream_buffer' entries of the convolutions.
    for state_key in state_keys:
      self.assertIn(
          'stream_buffer', state_key,
          msg=f'Expecting stream_buffer only, found {state_key}')
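For reference, the head's causal pooling keeps a running sum and a frame counter across calls, so after the final frame their ratio matches the mean over the full clip, which is why the test compares against `tf.reduce_mean`. A minimal sketch mirroring the `state_head_pool_buffer` / `state_head_pool_frame_count` split (not the library's implementation):

    import tensorflow as tf

    frames = tf.random.normal([5, 4])   # 5 frames of head features
    pool_buffer = tf.zeros([4])         # running sum lives in the state
    frame_count = tf.constant(0.)
    for frame in frames:
      pool_buffer += frame
      frame_count += 1.
    tf.debugging.assert_near(pool_buffer / frame_count,
                             tf.reduce_mean(frames, axis=0))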

  def test_movinet_2plus1d_stream(self):
    tf.keras.backend.set_image_data_format('channels_last')
...
@@ -82,6 +82,9 @@ flags.DEFINE_string(
flags.DEFINE_string(
    'activation', 'swish',
    'The main activation to use across layers.')
flags.DEFINE_string(
    'classifier_activation', 'swish',
    'The classifier activation to use.')
flags.DEFINE_string(
    'gating_activation', 'sigmoid',
    'The gating activation to use in squeeze-excitation layers.')
@@ -124,11 +127,15 @@ def main(_) -> None:
  # states. These dimensions can be set to `None` once the model is built.
  input_shape = [1 if s is None else s for s in input_specs.shape]

  # Override swish activation implementation to remove custom gradients
  activation = FLAGS.activation
  if activation == 'swish':
    activation = 'simple_swish'
  classifier_activation = FLAGS.classifier_activation
  if classifier_activation == 'swish':
    classifier_activation = 'simple_swish'
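For reference, `simple_swish` is the composite x * sigmoid(x) form of the activation; the default `swish` in tf-models wraps a custom gradient, which the comment above says must be removed for export. A sketch of the composite form, assuming only that definition:

    import tensorflow as tf

    def simple_swish(x):
      # Composite x * sigmoid(x); no custom gradient, so it serializes
      # cleanly into an exported SavedModel.
      return x * tf.math.sigmoid(x)

    print(simple_swish(tf.constant(1.0)).numpy())  # ~0.73106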
  backbone = movinet.Movinet(
      model_id=FLAGS.model_id,
      causal=FLAGS.causal,
@@ -145,9 +152,7 @@ def main(_) -> None:
      num_classes=FLAGS.num_classes,
      output_states=FLAGS.causal,
      input_specs=dict(image=input_specs),
      activation=classifier_activation)
  model.build(input_shape)

  # Compile model to generate some internal Keras variables.
...
@@ -18,7 +18,7 @@ from absl import flags

import tensorflow as tf
import tensorflow_hub as hub

from official.projects.movinet.tools import export_saved_model

FLAGS = flags.FLAGS
...
@@ -145,7 +145,7 @@ class Encoder(tf.keras.layers.Layer):
    self._encoder_layers = []
    # Set layer norm epsilons to 1e-6 to be consistent with JAX implementation.
    # https://flax.readthedocs.io/en/latest/_autosummary/flax.deprecated.nn.LayerNorm.html
    for i in range(self._num_layers):
      encoder_layer = nn_blocks.TransformerEncoderBlock(
          inner_activation=activations.gelu,
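For reference, Keras' `LayerNormalization` defaults to `epsilon=1e-3`, so matching Flax's 1e-6 default requires setting it explicitly, as the comment above notes. An illustrative layer only, not the encoder code:

    import tensorflow as tf

    norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    print(norm(tf.constant([[1., 2., 3.]])).numpy())  # ~[-1.2247, 0., 1.2247]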
...
@@ -12,3 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Vision package definition."""
# Lint as: python3
# pylint: disable=unused-import
from official.vision.beta import configs
from official.vision.beta import tasks
@@ -55,6 +55,20 @@ depth, label smoothing and dropout.
| ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i256.tar.gz) |
| ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i320.tar.gz) |
#### Vision Transformer (ViT)

We support [ViT](https://arxiv.org/abs/2010.11929) and [DeiT](https://arxiv.org/abs/2012.12877) implementations in a TF Vision [project](https://github.com/tensorflow/models/tree/master/official/projects/vit). ViT models trained under the DeiT settings:

model   | resolution | Top-1 | Top-5 |
--------- | :--------: | ----: | ----: |
ViT-s16 | 224x224 | 79.4 | 94.7 |
ViT-b16 | 224x224 | 81.8 | 95.8 |
ViT-l16 | 224x224 | 82.2 | 95.8 |
## Object Detection and Instance Segmentation

### Common Settings and Notes

@@ -123,6 +137,7 @@ evaluated on [COCO](https://cocodataset.org/) val2017.

| Backbone     | Resolution | Epochs | Params (M) | Box AP | Mask AP | Download
| ------------ | :--------: | -----: | ---------: | -----: | ------: | -------:
| SpineNet-49  | 640x640    | 500    | 56.4       | 46.4   | 40.0    | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml)|
| SpineNet-96  | 1024x1024  | 500    | 70.8       | 50.9   | 43.8    | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml)|
| SpineNet-143 | 1280x1280  | 500    | 94.9       | 51.9   | 45.0    | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml)|

## Semantic Segmentation
...
# MobileNetV3Small ImageNet classification. 67.5% top-1 and 87.6% top-5 accuracy.
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
task:
  model:
    num_classes: 1001
    input_size: [224, 224, 3]
    backbone:
      type: 'mobilenet'
      mobilenet:
        model_id: 'MobileNetV3Small'
        filter_size_scale: 1.0
    norm_activation:
      activation: 'relu'
      norm_momentum: 0.997
      norm_epsilon: 0.001
      use_sync_bn: false
    dropout_rate: 0.2
  losses:
    l2_weight_decay: 0.00001
    one_hot: true
    label_smoothing: 0.1
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: true
    global_batch_size: 4096
    dtype: 'bfloat16'
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: false
    global_batch_size: 4096
    dtype: 'bfloat16'
    drop_remainder: false
trainer:
  train_steps: 312000  # 1000 epochs
  validation_steps: 12
  validation_interval: 312
  steps_per_loop: 312  # NUM_EXAMPLES (1281167) // global_batch_size
  summary_interval: 312
  checkpoint_interval: 312
  optimizer_config:
    optimizer:
      type: 'rmsprop'
      rmsprop:
        rho: 0.9
        momentum: 0.9
        epsilon: 0.002
    learning_rate:
      type: 'exponential'
      exponential:
        initial_learning_rate: 0.01
        decay_steps: 936  # 3 * steps_per_epoch
        decay_rate: 0.99
        staircase: true
    ema:
      average_decay: 0.9999
      trainable_weights_only: false
    warmup:
      type: 'linear'
      linear:
        warmup_steps: 1560
        warmup_learning_rate: 0.001
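For reference, the step counts in this config all derive from the ImageNet epoch length; a quick check in plain Python (the 50k validation-set size is an assumption from standard ImageNet-2012):

    num_examples = 1281167                   # ImageNet-2012 training set
    global_batch_size = 4096
    steps_per_epoch = num_examples // global_batch_size
    assert steps_per_epoch == 312            # steps_per_loop / summary_interval
    assert steps_per_epoch * 1000 == 312000  # train_steps: 1000 epochs
    assert steps_per_epoch * 3 == 936        # decay_steps: decay every 3 epochs
    assert steps_per_epoch * 5 == 1560       # warmup_steps: 5 epochs of warmup
    assert 50000 // global_batch_size == 12  # validation_steps over 50k images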
# --experiment_type=cascadercnn_spinenet_coco
# Expect to reach: box mAP: 51.9%, mask mAP: 45.0% on COCO
runtime:
  distribution_strategy: 'tpu'
  mixed_precision_dtype: 'bfloat16'
@@ -8,12 +10,12 @@ task:
    parser:
      aug_rand_hflip: true
      aug_scale_min: 0.1
      aug_scale_max: 2.5
  losses:
    l2_weight_decay: 0.00004
  model:
    anchor:
      anchor_size: 4.0
      num_scales: 3
      min_level: 3
      max_level: 7
...
@@ -714,7 +714,8 @@ class InvertedBottleneckBlock(tf.keras.layers.Layer):
        'use_depthwise': self._use_depthwise,
        'use_residual': self._use_residual,
        'norm_momentum': self._norm_momentum,
        'norm_epsilon': self._norm_epsilon,
        'output_intermediate_endpoints': self._output_intermediate_endpoints
    }
    base_config = super(InvertedBottleneckBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
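For reference, the new `output_intermediate_endpoints` entry follows the Keras `get_config` contract: every constructor argument must be serialized, or `from_config` cannot rebuild the layer. A minimal stand-in layer showing the round trip (not the real `InvertedBottleneckBlock`):

    import tensorflow as tf

    class Block(tf.keras.layers.Layer):

      def __init__(self, output_intermediate_endpoints=False, **kwargs):
        super().__init__(**kwargs)
        self._output_intermediate_endpoints = output_intermediate_endpoints

      def get_config(self):
        config = {
            # Every constructor argument must appear here, or from_config()
            # cannot rebuild an equivalent layer.
            'output_intermediate_endpoints': self._output_intermediate_endpoints,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))

    block = Block(output_intermediate_endpoints=True)
    clone = Block.from_config(block.get_config())
    print(clone._output_intermediate_endpoints)  # True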
...
@@ -2284,8 +2284,9 @@ class MixupAndCutmix:
        lambda x: _fill_rectangle(*x),
        (images, random_center_width, random_center_height, cut_width // 2,
         cut_height // 2, tf.reverse(images, [0])),
        dtype=(
            images.dtype, tf.int32, tf.int32, tf.int32, tf.int32, images.dtype),
        fn_output_signature=tf.TensorSpec(images.shape[1:], dtype=images.dtype))

    return images, labels, lam

@@ -2294,7 +2295,8 @@ class MixupAndCutmix:
    lam = MixupAndCutmix._sample_from_beta(self.mixup_alpha, self.mixup_alpha,
                                           labels.shape)
    lam = tf.reshape(lam, [-1, 1, 1, 1])

    lam_cast = tf.cast(lam, dtype=images.dtype)
    images = lam_cast * images + (1. - lam_cast) * tf.reverse(images, [0])

    return images, labels, tf.squeeze(lam)
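For reference, the casts above matter because TensorFlow does not implicitly promote dtypes: blending `bfloat16` images with a `float32` mixing weight raises a dtype mismatch error. A minimal sketch of the pattern (illustrative shapes and values):

    import tensorflow as tf

    images = tf.cast(tf.random.normal([4, 8, 8, 3]), tf.bfloat16)
    lam = tf.constant([0.3, 0.7, 0.5, 0.9])   # Beta samples stay float32
    lam = tf.reshape(lam, [-1, 1, 1, 1])

    # Cast the mixing weight to the image dtype before blending; multiplying
    # a float32 tensor into bfloat16 images fails otherwise.
    lam_cast = tf.cast(lam, dtype=images.dtype)
    mixed = lam_cast * images + (1. - lam_cast) * tf.reverse(images, [0])
    print(mixed.dtype)  # <dtype: 'bfloat16'>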
...
@@ -366,14 +366,19 @@ class RandomErasingTest(tf.test.TestCase, parameterized.TestCase):
    self.assertNotEqual(0, tf.reduce_max(aug_image))


@parameterized.named_parameters([
    ('float16_images', tf.float16),
    ('bfloat16_images', tf.bfloat16),
    ('float32_images', tf.float32),
])
class MixupAndCutmixTest(parameterized.TestCase, tf.test.TestCase):

  def test_mixup_and_cutmix_smoothes_labels(self, image_dtype):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=image_dtype)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        num_classes=num_classes, label_smoothing=label_smoothing)

@@ -388,12 +393,12 @@ class MixupAndCutmixTest(tf.test.TestCase, parameterized.TestCase):
    self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes -
                               1e4)  # With tolerance

  def test_mixup_changes_image(self, image_dtype):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=image_dtype)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        mixup_alpha=1., cutmix_alpha=0., num_classes=num_classes)

@@ -409,12 +414,12 @@ class MixupAndCutmixTest(tf.test.TestCase, parameterized.TestCase):
                               1e4)  # With tolerance
    self.assertFalse(tf.math.reduce_all(images == aug_images))

  def test_cutmix_changes_image(self, image_dtype):
    batch_size = 12
    num_classes = 1000
    label_smoothing = 0.1

    images = tf.random.normal((batch_size, 224, 224, 3), dtype=image_dtype)
    labels = tf.range(batch_size)
    augmenter = augment.MixupAndCutmix(
        mixup_alpha=0., cutmix_alpha=1., num_classes=num_classes)
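For reference, decorating the test class (rather than each method) with `parameterized.named_parameters` forwards the dtype parameter to every test method, which is why each `test_*` above now takes `image_dtype`. A minimal example of the pattern:

    from absl.testing import parameterized
    import tensorflow as tf

    @parameterized.named_parameters([
        ('float16_images', tf.float16),
        ('float32_images', tf.float32),
    ])
    class DtypeTest(parameterized.TestCase, tf.test.TestCase):

      # The class-level decorator passes the parameter to every test method.
      def test_ones_keep_dtype(self, image_dtype):
        self.assertEqual(tf.ones([2], dtype=image_dtype).dtype, image_dtype)

    if __name__ == '__main__':
      tf.test.main()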
...
@@ -25,6 +25,7 @@ from official.modeling import optimization
from official.vision.beta.configs import common
from official.vision.beta.configs import maskrcnn
from official.vision.beta.configs import semantic_segmentation
from official.vision.beta.projects.deepmac_maskrcnn.configs import deep_mask_head_rcnn as deepmac_maskrcnn

SEGMENTATION_MODEL = semantic_segmentation.SemanticSegmentationModel

@@ -89,7 +90,7 @@ class PanopticSegmentationGenerator(hyperparams.Config):

@dataclasses.dataclass
class PanopticMaskRCNN(deepmac_maskrcnn.DeepMaskHeadRCNN):
  """Panoptic Mask R-CNN model config."""

  segmentation_model: semantic_segmentation.SemanticSegmentationModel = (
      SEGMENTATION_MODEL(num_classes=2))
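For reference, rebasing the config on `DeepMaskHeadRCNN` means `PanopticMaskRCNN` inherits the deepmac fields (such as `use_gt_boxes_for_masks`) in addition to its own. A sketch with stand-in dataclasses, not the real hyperparams classes:

    import dataclasses

    @dataclasses.dataclass
    class DeepMaskHeadRCNN:                    # stand-in for the deepmac config
      use_gt_boxes_for_masks: bool = False

    @dataclasses.dataclass
    class PanopticMaskRCNN(DeepMaskHeadRCNN):  # stand-in for the panoptic config
      segmentation_num_classes: int = 2

    cfg = PanopticMaskRCNN()
    print(cfg.use_gt_boxes_for_masks)  # False, inherited from the deepmac base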
...
@@ -17,10 +17,10 @@

import tensorflow as tf

from official.vision.beta.modeling import backbones
from official.vision.beta.modeling.decoders import factory as decoder_factory
from official.vision.beta.modeling.heads import segmentation_heads
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as panoptic_deeplab_cfg
from official.vision.beta.projects.deepmac_maskrcnn.tasks import deep_mask_head_rcnn
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as panoptic_maskrcnn_cfg
from official.vision.beta.projects.panoptic_maskrcnn.modeling import panoptic_deeplab_model
from official.vision.beta.projects.panoptic_maskrcnn.modeling.heads import panoptic_deeplab_heads

@@ -50,7 +50,7 @@ def build_panoptic_maskrcnn(
  segmentation_config = model_config.segmentation_model

  # Builds the maskrcnn model.
  maskrcnn_model = deep_mask_head_rcnn.build_maskrcnn(
      input_specs=input_specs,
      model_config=model_config,
      l2_regularizer=l2_regularizer)

@@ -120,6 +120,7 @@ def build_panoptic_maskrcnn(
  # Combines maskrcnn, and segmentation models to build panoptic segmentation
  # model.
  model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
      backbone=maskrcnn_model.backbone,
      decoder=maskrcnn_model.decoder,
...
@@ -18,10 +18,10 @@ from typing import List, Mapping, Optional, Union

import tensorflow as tf

from official.vision.beta.projects.deepmac_maskrcnn.modeling import maskrcnn_model


class PanopticMaskRCNNModel(maskrcnn_model.DeepMaskRCNNModel):
  """The Panoptic Segmentation model."""

  def __init__(
@@ -49,7 +49,8 @@ class PanopticMaskRCNNModel(maskrcnn_model.MaskRCNNModel):
      max_level: Optional[int] = None,
      num_scales: Optional[int] = None,
      aspect_ratios: Optional[List[float]] = None,
      anchor_size: Optional[float] = None,
      use_gt_boxes_for_masks: bool = False,  # pytype: disable=annotation-type-mismatch  # typed-keras
      **kwargs):
    """Initializes the Panoptic Mask R-CNN model.

@@ -94,6 +95,7 @@ class PanopticMaskRCNNModel(maskrcnn_model.MaskRCNNModel):
        aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
      anchor_size: A number representing the scale of size of the base anchor
        to the feature stride 2^level.
      use_gt_boxes_for_masks: `bool`, whether to use only gt boxes for masks.
      **kwargs: keyword arguments to be passed.
    """
    super(PanopticMaskRCNNModel, self).__init__(
@@ -115,6 +117,7 @@ class PanopticMaskRCNNModel(maskrcnn_model.MaskRCNNModel):
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size,
        use_gt_boxes_for_masks=use_gt_boxes_for_masks,
        **kwargs)

    self._config_dict.update({
...
@@ -97,6 +97,20 @@ class PanopticSegmentationModule(detection.DetectionModule):
        anchor_boxes=anchor_boxes,
        training=False)

    detections.pop('rpn_boxes')
    detections.pop('rpn_scores')
    detections.pop('cls_outputs')
    detections.pop('box_outputs')
    detections.pop('backbone_features')
    detections.pop('decoder_features')

    # Normalize detection boxes to [0, 1]. Here we first map them to the
    # original image size, then normalize them to [0, 1].
    detections['detection_boxes'] = (
        detections['detection_boxes'] /
        tf.tile(image_info[:, 2:3, :], [1, 1, 2]) /
        tf.tile(image_info[:, 0:1, :], [1, 1, 2]))
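For reference, this assumes the tf-models `image_info` layout: row 0 is the original (height, width), row 1 the desired size, row 2 the y/x resize scale, row 3 the offset. Dividing by the tiled scale maps boxes back to original-image pixels; dividing by the tiled original size then normalizes to [0, 1]. A worked example with illustrative numbers:

    import tensorflow as tf

    # image_info layout assumed from tf-models preprocessing (see above).
    image_info = tf.constant([[[480., 640.],
                               [512., 512.],
                               [512. / 480., 512. / 640.],
                               [0., 0.]]])
    boxes = tf.constant([[[51.2, 64., 512., 512.]]])  # ymin, xmin, ymax, xmax

    boxes /= tf.tile(image_info[:, 2:3, :], [1, 1, 2])  # back to original pixels
    boxes /= tf.tile(image_info[:, 0:1, :], [1, 1, 2])  # normalize to [0, 1]
    print(boxes.numpy())  # [[[0.1 0.125 1. 1.]]]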
    if model_params.detection_generator.apply_nms:
      final_outputs = {
          'detection_boxes': detections['detection_boxes'],
@@ -109,10 +123,15 @@ class PanopticSegmentationModule(detection.DetectionModule):
          'decoded_boxes': detections['decoded_boxes'],
          'decoded_box_scores': detections['decoded_box_scores']
      }

    masks = detections['segmentation_outputs']
    masks = tf.image.resize(masks, self._input_image_size, method='bilinear')
    classes = tf.math.argmax(masks, axis=-1)
    scores = tf.nn.softmax(masks, axis=-1)

    final_outputs.update({
        'detection_masks': detections['detection_masks'],
        'masks': masks,
        'scores': scores,
        'classes': classes,
        'image_info': image_info
    })

    if model_params.generate_panoptic_masks:
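For reference, the shapes involved in the new segmentation outputs: resizing keeps the channel (class) axis, softmax produces per-pixel class confidences, and argmax collapses the class axis into a hard label map. A quick shape walk-through with illustrative sizes:

    import tensorflow as tf

    logits = tf.random.normal([1, 16, 16, 21])   # per-pixel class logits
    masks = tf.image.resize(logits, [64, 64], method='bilinear')
    scores = tf.nn.softmax(masks, axis=-1)       # per-pixel class confidences
    classes = tf.math.argmax(masks, axis=-1)     # hard label per pixel
    print(masks.shape, scores.shape, classes.shape)
    # (1, 64, 64, 21) (1, 64, 64, 21) (1, 64, 64)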
...
@@ -61,7 +61,7 @@ class PanopticMaskRCNNTask(maskrcnn.MaskRCNNTask):

  def initialize(self, model: tf.keras.Model) -> None:
    """Loading pretrained checkpoint."""
    if not self.task_config.init_checkpoint:
      return

    def _get_checkpoint_path(checkpoint_dir_or_file):
...
@@ -34,7 +34,7 @@ import PIL.ImageFont as ImageFont

import six
import tensorflow as tf

from official.vision.ops import box_ops
from official.vision.utils.object_detection import shape_utils

_TITLE_LEFT_MARGIN = 10
...
@@ -1050,6 +1050,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
      else:
        raise ValueError(f'Unknown heatmap type - {self._box_heatmap_type}')

      heatmap = tf.stop_gradient(heatmap)
      heatmaps.append(heatmap)

    # Return the stacked heatmaps over the batch.
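For reference, wrapping the assembled heatmap in `tf.stop_gradient` marks it as a training target rather than part of the differentiable graph, so no gradient flows back through the target construction. A minimal demonstration:

    import tensorflow as tf

    x = tf.Variable(2.0)
    with tf.GradientTape() as tape:
      target = tf.stop_gradient(x * 3.0)   # treat the assembled target as data
      loss = (x - target) ** 2
    print(tape.gradient(loss, x).numpy())  # -8.0; nothing flows through target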
...