Commit cf80ed4e authored by anivegesana

Merge branch 'purdue-yolo' of https://github.com/tensorflow/models into detection_generator_pr_2

parents 394cefcc 461b3587
@@ -43,6 +43,9 @@ S12: KernelSize = (1, 2, 2)
S22: KernelSize = (2, 2, 2)
S21: KernelSize = (2, 1, 1)
# Type for a state container (map)
TensorMap = Mapping[str, tf.Tensor]
@dataclasses.dataclass
class BlockSpec:
@@ -319,6 +322,7 @@ class Movinet(tf.keras.Model):
bias_regularizer: Optional[str] = None,
stochastic_depth_drop_rate: float = 0.,
use_external_states: bool = False,
output_states: bool = True,
**kwargs):
"""MoViNet initialization function.
@@ -353,6 +357,10 @@ class Movinet(tf.keras.Model):
stochastic_depth_drop_rate: the base rate for stochastic depth.
use_external_states: if True, expects states to be passed as additional
input.
output_states: if True, output intermediate states that can be used to run
the model in streaming mode. Inputting the output states of the
previous input clip with the current input clip will utilize a stream
buffer for streaming video.
**kwargs: keyword arguments to be passed.
"""
block_specs = BLOCK_SPECS[model_id]
@@ -385,6 +393,7 @@ class Movinet(tf.keras.Model):
self._bias_regularizer = bias_regularizer
self._stochastic_depth_drop_rate = stochastic_depth_drop_rate
self._use_external_states = use_external_states
self._output_states = output_states
if self._use_external_states and not self._causal:
raise ValueError('External states should be used with causal mode.')
@@ -411,8 +420,7 @@ class Movinet(tf.keras.Model):
self,
input_specs: tf.keras.layers.InputSpec,
state_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
-) -> Tuple[Mapping[str, tf.keras.Input], Tuple[Mapping[str, tf.Tensor],
-Mapping[str, tf.Tensor]]]:
+) -> Tuple[TensorMap, Union[TensorMap, Tuple[TensorMap, TensorMap]]]:
"""Builds the model network.
Args:
@@ -423,7 +431,7 @@ class Movinet(tf.keras.Model):
Returns:
Inputs and outputs as a tuple. Inputs are expected to be a dict with
base input and states. Outputs are expected to be a dict of endpoints
-and output states.
+and (optional) output states.
"""
state_specs = state_specs if state_specs is not None else {}
@@ -519,7 +527,7 @@ class Movinet(tf.keras.Model):
else:
raise ValueError('Unknown block type {}'.format(block))
-outputs = (endpoints, states)
+outputs = (endpoints, states) if self._output_states else endpoints
return inputs, outputs
@@ -679,6 +687,8 @@ class Movinet(tf.keras.Model):
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate,
'use_external_states': self._use_external_states,
'output_states': self._output_states,
}
return config_dict
...
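The new output_states flag is what makes clip-by-clip streaming possible. A minimal sketch of that loop, modeled on the MoViNet streaming examples in this package; the model_id, num_classes, frame shape, and the init_states/'image' input signature are assumptions that may differ by version:

import tensorflow as tf

from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model

# Build a causal backbone that consumes and emits stream-buffer states.
backbone = movinet.Movinet(
    model_id='a0', causal=True, use_external_states=True)
model = movinet_model.MovinetClassifier(
    backbone=backbone, num_classes=600, output_states=True)

frames = tf.random.normal([1, 8, 172, 172, 3])
states = model.init_states(tf.shape(frames))

# Feed the clip frame by frame; the states returned for frame t become
# the input states for frame t+1, emulating a stream buffer.
for frame in tf.split(frames, frames.shape[1], axis=1):
  logits, states = model({**states, 'image': frame})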
@@ -265,7 +265,7 @@ class ConvBlock(tf.keras.layers.Layer):
tf.keras.regularizers.L2(KERNEL_WEIGHT_DECAY),
use_batch_norm: bool = True,
-batch_norm_layer: tf.keras.layers.Layer =
-tf.keras.layers.experimental.SyncBatchNormalization,
+batch_norm_layer: tf.keras.layers.Layer =
+tf.keras.layers.BatchNormalization,
batch_norm_momentum: float = 0.99,
batch_norm_epsilon: float = 1e-3,
activation: Optional[Any] = None,
@@ -547,8 +547,8 @@ class StreamConvBlock(ConvBlock):
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = tf.keras
.regularizers.L2(KERNEL_WEIGHT_DECAY),
use_batch_norm: bool = True,
-batch_norm_layer: tf.keras.layers.Layer = tf.keras.layers.experimental
-.SyncBatchNormalization,
+batch_norm_layer: tf.keras.layers.Layer =
+tf.keras.layers.BatchNormalization,
batch_norm_momentum: float = 0.99,
batch_norm_epsilon: float = 1e-3,
activation: Optional[Any] = None,
@@ -915,7 +915,7 @@ class SkipBlock(tf.keras.layers.Layer):
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] =
tf.keras.regularizers.L2(KERNEL_WEIGHT_DECAY),
-batch_norm_layer: tf.keras.layers.Layer =
-tf.keras.layers.experimental.SyncBatchNormalization,
+batch_norm_layer: tf.keras.layers.Layer =
+tf.keras.layers.BatchNormalization,
batch_norm_momentum: float = 0.99,
batch_norm_epsilon: float = 1e-3,
**kwargs):
@@ -1031,8 +1031,8 @@ class MovinetBlock(tf.keras.layers.Layer):
kernel_initializer: tf.keras.initializers.Initializer = 'HeNormal',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = tf.keras
.regularizers.L2(KERNEL_WEIGHT_DECAY),
-batch_norm_layer: tf.keras.layers.Layer = tf.keras.layers.experimental
-.SyncBatchNormalization,
+batch_norm_layer: tf.keras.layers.Layer =
+tf.keras.layers.BatchNormalization,
batch_norm_momentum: float = 0.99,
batch_norm_epsilon: float = 1e-3,
state_prefix: Optional[str] = None,
@@ -1078,7 +1078,6 @@ class MovinetBlock(tf.keras.layers.Layer):
se_ratio * expand_filters * se_multiplier, divisor=8)
self._out_filters = out_filters
self._expand_filters = expand_filters
-self._kernel_size = kernel_size
self._causal = causal
self._activation = activation
self._gating_activation = gating_activation
@@ -1232,8 +1231,8 @@ class Stem(tf.keras.layers.Layer):
kernel_initializer: tf.keras.initializers.Initializer = 'HeNormal',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = tf.keras
.regularizers.L2(KERNEL_WEIGHT_DECAY),
-batch_norm_layer: tf.keras.layers.Layer = tf.keras.layers.experimental
-.SyncBatchNormalization,
+batch_norm_layer: tf.keras.layers.Layer =
+tf.keras.layers.BatchNormalization,
batch_norm_momentum: float = 0.99,
batch_norm_epsilon: float = 1e-3,
state_prefix: Optional[str] = None,
@@ -1340,8 +1339,8 @@ class Head(tf.keras.layers.Layer):
kernel_initializer: tf.keras.initializers.Initializer = 'HeNormal',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = tf.keras
.regularizers.L2(KERNEL_WEIGHT_DECAY),
-batch_norm_layer: tf.keras.layers.Layer = tf.keras.layers.experimental
-.SyncBatchNormalization,
+batch_norm_layer: tf.keras.layers.Layer =
+tf.keras.layers.BatchNormalization,
batch_norm_momentum: float = 0.99,
batch_norm_epsilon: float = 1e-3,
state_prefix: Optional[str] = None,
@@ -1470,6 +1469,7 @@ class ClassifierHead(tf.keras.layers.Layer):
self._num_classes = num_classes
self._dropout_rate = dropout_rate
self._conv_type = conv_type
self._activation = activation
self._output_activation = output_activation
self._max_pool_predictions = max_pool_predictions
self._kernel_initializer = kernel_initializer
@@ -1509,6 +1509,7 @@ class ClassifierHead(tf.keras.layers.Layer):
'num_classes': self._num_classes,
'dropout_rate': self._dropout_rate,
'conv_type': self._conv_type,
'activation': self._activation,
'output_activation': self._output_activation,
'max_pool_predictions': self._max_pool_predictions,
'kernel_initializer': self._kernel_initializer,
...
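The recurring change in these hunks swaps the default batch_norm_layer from tf.keras.layers.experimental.SyncBatchNormalization to plain BatchNormalization. Callers that still want cross-replica batch statistics can opt back in explicitly; a hedged sketch, where the filters/kernel_size arguments are assumptions about ConvBlock's full signature:

import tensorflow as tf

from official.vision.beta.projects.movinet.modeling import movinet_layers

# Override the new default to restore synchronized batch norm.
block = movinet_layers.ConvBlock(
    filters=64,
    kernel_size=(1, 3, 3),
    batch_norm_layer=tf.keras.layers.experimental.SyncBatchNormalization)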
@@ -36,6 +36,7 @@ class MovinetClassifier(tf.keras.Model):
backbone: tf.keras.Model,
num_classes: int,
input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
activation: str = 'swish',
dropout_rate: float = 0.0,
kernel_initializer: str = 'HeNormal',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
@@ -48,6 +49,7 @@ class MovinetClassifier(tf.keras.Model):
backbone: A 3d backbone network.
num_classes: Number of classes in classification task.
input_specs: Specs of the input tensor.
activation: name of the main activation function.
dropout_rate: Rate for dropout regularization.
kernel_initializer: Kernel initializer for the final dense layer.
kernel_regularizer: Kernel regularizer.
@@ -65,6 +67,7 @@ class MovinetClassifier(tf.keras.Model):
self._num_classes = num_classes
self._input_specs = input_specs
self._activation = activation
self._dropout_rate = dropout_rate
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
@@ -151,7 +154,8 @@ class MovinetClassifier(tf.keras.Model):
dropout_rate=self._dropout_rate,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
-conv_type=backbone.conv_type)(
+conv_type=backbone.conv_type,
+activation=self._activation)(
x)
outputs = (x, states) if self._output_states else x
@@ -180,6 +184,7 @@ class MovinetClassifier(tf.keras.Model):
def get_config(self):
config = {
'backbone': self._backbone,
'activation': self._activation,
'num_classes': self._num_classes,
'input_specs': self._input_specs,
'dropout_rate': self._dropout_rate,
@@ -226,6 +231,7 @@ def build_movinet_model(
num_classes=num_classes,
kernel_regularizer=l2_regularizer,
input_specs=input_specs_dict,
activation=model_config.activation,
dropout_rate=model_config.dropout_rate,
output_states=model_config.output_states)
...
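For reference, a short sketch of the new activation plumbing end to end: MovinetClassifier now forwards the argument to the ClassifierHead call site shown above. The model_id and num_classes values are illustrative:

from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model

backbone = movinet.Movinet(model_id='a0')
# 'swish' matches the default; pass e.g. 'relu' to change the head's
# main activation without touching the backbone.
model = movinet_model.MovinetClassifier(
    backbone=backbone, num_classes=600, activation='swish')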
@@ -15,15 +15,153 @@
"""Panoptic Mask R-CNN configuration definition."""
import dataclasses
import os
from typing import List, Optional
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.vision.beta.configs import maskrcnn
from official.vision.beta.configs import semantic_segmentation
SEGMENTATION_MODEL = semantic_segmentation.SemanticSegmentationModel
SEGMENTATION_HEAD = semantic_segmentation.SegmentationHead
_COCO_INPUT_PATH_BASE = 'coco'
_COCO_TRAIN_EXAMPLES = 118287
_COCO_VAL_EXAMPLES = 5000
# pytype: disable=wrong-keyword-args
@dataclasses.dataclass
class Parser(maskrcnn.Parser):
"""Panoptic Mask R-CNN parser config."""
# If segmentation_resize_eval_groundtruth is set to False, original image
# sizes are used for eval. In that case,
# segmentation_groundtruth_padded_size has to be specified too to allow for
# batching the variable input sizes of images.
segmentation_resize_eval_groundtruth: bool = True
segmentation_groundtruth_padded_size: List[int] = dataclasses.field(
default_factory=list)
segmentation_ignore_label: int = 255
@dataclasses.dataclass
class DataConfig(maskrcnn.DataConfig):
"""Input config for training."""
parser: Parser = Parser()
@dataclasses.dataclass
class PanopticMaskRCNN(maskrcnn.MaskRCNN):
"""Panoptic Mask R-CNN model config."""
segmentation_model: semantic_segmentation.SemanticSegmentationModel = (
-semantic_segmentation.SemanticSegmentationModel(num_classes=2))
+SEGMENTATION_MODEL(num_classes=2))
include_mask = True
shared_backbone: bool = True
shared_decoder: bool = True
@dataclasses.dataclass
class Losses(maskrcnn.Losses):
"""Panoptic Mask R-CNN loss config."""
semantic_segmentation_label_smoothing: float = 0.0
semantic_segmentation_ignore_label: int = 255
semantic_segmentation_class_weights: List[float] = dataclasses.field(
default_factory=list)
semantic_segmentation_use_groundtruth_dimension: bool = True
semantic_segmentation_top_k_percent_pixels: float = 1.0
semantic_segmentation_weight: float = 1.0
@dataclasses.dataclass
class PanopticMaskRCNNTask(maskrcnn.MaskRCNNTask):
"""Panoptic Mask R-CNN task config."""
model: PanopticMaskRCNN = PanopticMaskRCNN()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False,
drop_remainder=False)
segmentation_evaluation: semantic_segmentation.Evaluation = semantic_segmentation.Evaluation() # pylint: disable=line-too-long
losses: Losses = Losses()
init_checkpoint: Optional[str] = None
segmentation_init_checkpoint: Optional[str] = None
# 'init_checkpoint_modules' controls the modules that need to be initialized
# from checkpoint paths given by 'init_checkpoint' and/or
# 'segmentation_init_checkpoint'. Supports modules:
# 'backbone': Initialize MaskRCNN backbone
# 'segmentation_backbone': Initialize segmentation backbone
# 'segmentation_decoder': Initialize segmentation decoder
# 'all': Initialize all modules
init_checkpoint_modules: Optional[List[str]] = dataclasses.field(
default_factory=list)
@exp_factory.register_config_factory('panoptic_maskrcnn_resnetfpn_coco')
def panoptic_maskrcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
"""COCO panoptic segmentation with Panoptic Mask R-CNN."""
train_batch_size = 64
eval_batch_size = 8
steps_per_epoch = _COCO_TRAIN_EXAMPLES // train_batch_size
validation_steps = _COCO_VAL_EXAMPLES // eval_batch_size
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
task=PanopticMaskRCNNTask(
init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080', # pylint: disable=line-too-long
init_checkpoint_modules=['backbone'],
model=PanopticMaskRCNN(
num_classes=91, input_size=[1024, 1024, 3],
segmentation_model=SEGMENTATION_MODEL(
num_classes=91,
head=SEGMENTATION_HEAD(level=3))),
losses=Losses(l2_weight_decay=0.00004),
train_data=DataConfig(
input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size,
parser=Parser(
aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
validation_data=DataConfig(
input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'val*'),
is_training=False,
global_batch_size=eval_batch_size,
drop_remainder=False),
annotation_file=os.path.join(_COCO_INPUT_PATH_BASE,
'instances_val2017.json')),
trainer=cfg.TrainerConfig(
train_steps=22500,
validation_steps=validation_steps,
validation_interval=steps_per_epoch,
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9
}
},
'learning_rate': {
'type': 'stepwise',
'stepwise': {
'boundaries': [15000, 20000],
'values': [0.12, 0.012, 0.0012],
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 500,
'warmup_learning_rate': 0.0067
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
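A minimal usage sketch for the experiment registered above; get_exp_config and validate are the standard official.core entry points, and the dataset paths are placeholders to be replaced with real TFRecord locations:

from official.core import exp_factory

config = exp_factory.get_exp_config('panoptic_maskrcnn_resnetfpn_coco')
# _COCO_INPUT_PATH_BASE above defaults to 'coco'; point the pipelines at
# the actual data before training.
config.task.train_data.input_path = '/data/coco/train*'
config.task.validation_data.input_path = '/data/coco/val*'
config.validate()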
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic maskrcnn config."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as exp_cfg
class PanopticMaskRCNNConfigTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
('panoptic_maskrcnn_resnetfpn_coco',),
)
def test_panoptic_maskrcnn_configs(self, config_name):
config = exp_factory.get_exp_config(config_name)
self.assertIsInstance(config, cfg.ExperimentConfig)
self.assertIsInstance(config.task, exp_cfg.PanopticMaskRCNNTask)
self.assertIsInstance(config.task.model, exp_cfg.PanopticMaskRCNN)
self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
config.validate()
config.task.train_data.is_training = None
with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
config.validate()
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Panoptic Mask R-CNN."""
import tensorflow as tf
from official.vision.beta.dataloaders import maskrcnn_input
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import preprocess_ops
class TfExampleDecoder(tf_example_decoder.TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, regenerate_source_id, mask_binarize_threshold):
super(TfExampleDecoder, self).__init__(
include_mask=True,
regenerate_source_id=regenerate_source_id,
mask_binarize_threshold=None)
self._segmentation_keys_to_features = {
'image/segmentation/class/encoded':
tf.io.FixedLenFeature((), tf.string, default_value='')
}
def decode(self, serialized_example):
decoded_tensors = super(TfExampleDecoder, self).decode(serialized_example)
segmentation_parsed_tensors = tf.io.parse_single_example(
serialized_example, self._segmentation_keys_to_features)
segmentation_mask = tf.io.decode_image(
segmentation_parsed_tensors['image/segmentation/class/encoded'],
channels=1)
segmentation_mask.set_shape([None, None, 1])
decoded_tensors.update({'groundtruth_segmentation_mask': segmentation_mask})
return decoded_tensors
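A self-contained sketch of the extra feature this decoder consumes: a tf.Example carrying an image-encoded (here PNG) class mask under 'image/segmentation/class/encoded'. Only the segmentation key is exercised; the detection features required by the parent decoder are omitted:

import numpy as np
import tensorflow as tf

# Encode a tiny single-channel class mask the way the decoder expects.
mask = np.zeros((4, 4, 1), dtype=np.uint8)
mask[:2, :2, 0] = 1  # class 1 in the top-left corner
encoded_mask = tf.io.encode_png(tf.convert_to_tensor(mask)).numpy()

feature = {
    'image/segmentation/class/encoded':
        tf.train.Feature(bytes_list=tf.train.BytesList(value=[encoded_mask])),
}
serialized = tf.train.Example(
    features=tf.train.Features(feature=feature)).SerializeToString()

# Mirrors the decode path above: parse the string feature, then recover
# the [height, width, 1] mask with tf.io.decode_image.
parsed = tf.io.parse_single_example(
    serialized,
    {'image/segmentation/class/encoded':
         tf.io.FixedLenFeature((), tf.string, default_value='')})
decoded = tf.io.decode_image(
    parsed['image/segmentation/class/encoded'], channels=1)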
class Parser(maskrcnn_input.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
rpn_match_threshold=0.7,
rpn_unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5,
aug_rand_hflip=False,
aug_scale_min=1.0,
aug_scale_max=1.0,
skip_crowd_during_training=True,
max_num_instances=100,
mask_crop_size=112,
segmentation_resize_eval_groundtruth=True,
segmentation_groundtruth_padded_size=None,
segmentation_ignore_label=255,
dtype='float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divisible by the largest feature stride 2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scale [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect ratio of
anchors added on each level. The number indicates the ratio of width to
height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: `float` number representing the scale of size of the base
anchor to the feature stride 2^level.
rpn_match_threshold: `float`, match threshold for anchors in RPN.
rpn_unmatched_threshold: `float`, unmatched threshold for anchors in RPN.
rpn_batch_size_per_im: `int` for batch size per image in RPN.
rpn_fg_fraction: `float` for foreground fraction per batch in RPN.
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
skip_crowd_during_training: `bool`, if True, skip annotations labeled with
`is_crowd` equal to 1.
max_num_instances: `int`, maximum number of instances in an
image. The groundtruth data will be padded to `max_num_instances`.
mask_crop_size: the size which groundtruth mask is cropped to.
segmentation_resize_eval_groundtruth: `bool`, if True, eval groundtruth
masks are resized to output_size.
segmentation_groundtruth_padded_size: `Tensor` or `list` for [height,
width]. When resize_eval_groundtruth is set to False, the groundtruth
masks are padded to this size.
segmentation_ignore_label: `int`, pixels with this ignore label will not
be used for training and evaluation.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
"""
super(Parser, self).__init__(
output_size=output_size,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
rpn_match_threshold=rpn_match_threshold,
rpn_unmatched_threshold=rpn_unmatched_threshold,
rpn_batch_size_per_im=rpn_batch_size_per_im,
rpn_fg_fraction=rpn_fg_fraction,
aug_rand_hflip=False,  # Flipping is applied in this subclass so the segmentation mask is flipped with the image.
aug_scale_min=aug_scale_min,
aug_scale_max=aug_scale_max,
skip_crowd_during_training=skip_crowd_during_training,
max_num_instances=max_num_instances,
include_mask=True,
mask_crop_size=mask_crop_size,
dtype=dtype)
self.aug_rand_hflip = aug_rand_hflip
self._segmentation_resize_eval_groundtruth = segmentation_resize_eval_groundtruth
if (not segmentation_resize_eval_groundtruth) and (
segmentation_groundtruth_padded_size is None):
raise ValueError(
'segmentation_groundtruth_padded_size ([height, width]) needs to be'
'specified when segmentation_resize_eval_groundtruth is False.')
self._segmentation_groundtruth_padded_size = segmentation_groundtruth_padded_size
self._segmentation_ignore_label = segmentation_ignore_label
def _parse_train_data(self, data):
"""Parses data for training.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
image: image tensor that is preprocessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for training. The following describes
{key: value} pairs in the dictionary.
image_info: a 2D `Tensor` that encodes the information of the image and
the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width]],
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each level.
rpn_score_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
rpn_box_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
gt_boxes: Groundtruth bounding box annotations. The box is represented
in [y1, x1, y2, x2] format. The coordinates are w.r.t. the scaled
image that is fed to the network. The tensor is padded with -1 to
the fixed dimension [self._max_num_instances, 4].
gt_classes: Groundtruth classes annotations. The tensor is padded
with -1 to the fixed dimension [self._max_num_instances].
gt_masks: Groundtruth masks cropped by the bounding box and
resized to a fixed size determined by mask_crop_size.
gt_segmentation_mask: Groundtruth mask for segmentation head, this is
resized to a fixed size determined by output_size.
gt_segmentation_valid_mask: Binary mask that marks the pixels that
are supposed to be used in computing the segmentation loss while
training.
"""
segmentation_mask = data['groundtruth_segmentation_mask']
# Flips image randomly during training.
if self.aug_rand_hflip:
masks = data['groundtruth_instance_masks']
image_mask = tf.concat([data['image'], segmentation_mask], axis=2)
image_mask, boxes, masks = preprocess_ops.random_horizontal_flip(
image_mask, data['groundtruth_boxes'], masks)
segmentation_mask = image_mask[:, :, -1:]
image = image_mask[:, :, :-1]
data['image'] = image
data['boxes'] = boxes
data['masks'] = masks
image, labels = super(Parser, self)._parse_train_data(data)
image_info = labels['image_info']
image_scale = image_info[2, :]
offset = image_info[3, :]
segmentation_mask = tf.reshape(
segmentation_mask, shape=[1, data['height'], data['width']])
segmentation_mask = tf.cast(segmentation_mask, tf.float32)
# Pad the label and make sure the padded region is assigned the ignore
# label. The label is first offset by +1 and then padded with 0.
segmentation_mask += 1
segmentation_mask = tf.expand_dims(segmentation_mask, axis=3)
segmentation_mask = preprocess_ops.resize_and_crop_masks(
segmentation_mask, image_scale, self._output_size, offset)
segmentation_mask -= 1
segmentation_mask = tf.where(
tf.equal(segmentation_mask, -1),
self._segmentation_ignore_label * tf.ones_like(segmentation_mask),
segmentation_mask)
segmentation_mask = tf.squeeze(segmentation_mask, axis=0)
segmentation_valid_mask = tf.not_equal(
segmentation_mask, self._segmentation_ignore_label)
labels.update({
'gt_segmentation_mask': segmentation_mask,
'gt_segmentation_valid_mask': segmentation_valid_mask})
return image, labels
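The +1/pad/-1 sequence above is easy to misread, so here is the same arithmetic in isolation: offsetting labels by one frees the value 0 for padding, and subtracting one afterwards maps exactly the padded pixels to -1, which are then rewritten to the ignore label. A sketch with a hypothetical 2x2 mask padded to 2x4:

import tensorflow as tf

ignore_label = 255
mask = tf.constant([[0., 1.], [2., 0.]])     # real classes 0..2
shifted = mask + 1                           # classes become 1..3
padded = tf.pad(shifted, [[0, 0], [0, 2]])   # padding fills with 0
restored = padded - 1                        # real pixels 0..2, pads -> -1
restored = tf.where(
    tf.equal(restored, -1),
    ignore_label * tf.ones_like(restored),
    restored)
# restored == [[0, 1, 255, 255], [2, 0, 255, 255]]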
def _parse_eval_data(self, data):
"""Parses data for evaluation.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
A dictionary of {'images': image, 'labels': labels} where
image: image tensor that is preprocessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for evaluation. The following
describes {key: value} pairs in the dictionary.
source_ids: Source image id. Default value -1 if the source id is
empty in the groundtruth annotation.
image_info: a 2D `Tensor` that encodes the information of the image
and the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width]],
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each
level.
"""
segmentation_mask = tf.cast(
data['groundtruth_segmentation_mask'], tf.float32)
segmentation_mask = tf.reshape(
segmentation_mask, shape=[1, data['height'], data['width'], 1])
segmentation_mask += 1
image, labels = super(Parser, self)._parse_eval_data(data)
if self._segmentation_resize_eval_groundtruth:
# Resizes eval masks to match input image sizes. In that case, mean IoU
# is computed on output_size, not the original size of the images.
image_info = labels['image_info']
image_scale = image_info[2, :]
offset = image_info[3, :]
segmentation_mask = preprocess_ops.resize_and_crop_masks(
segmentation_mask, image_scale, self._output_size, offset)
else:
segmentation_mask = tf.image.pad_to_bounding_box(
segmentation_mask, 0, 0,
self._segmentation_groundtruth_padded_size[0],
self._segmentation_groundtruth_padded_size[1])
segmentation_mask -= 1
# Assign ignore label to the padded region.
segmentation_mask = tf.where(
tf.equal(segmentation_mask, -1),
self._segmentation_ignore_label * tf.ones_like(segmentation_mask),
segmentation_mask)
segmentation_mask = tf.squeeze(segmentation_mask, axis=0)
segmentation_valid_mask = tf.not_equal(
segmentation_mask, self._segmentation_ignore_label)
labels['groundtruths'].update({
'gt_segmentation_mask': segmentation_mask,
'gt_segmentation_valid_mask': segmentation_valid_mask})
return image, labels
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Panoptic MaskRCNN task definition."""
from typing import Any, List, Mapping, Optional, Tuple, Dict
from absl import logging
import tensorflow as tf
from official.common import dataset_fn
from official.core import task_factory
from official.vision.beta.dataloaders import input_reader_factory
from official.vision.beta.evaluation import coco_evaluator
from official.vision.beta.evaluation import segmentation_metrics
from official.vision.beta.losses import segmentation_losses
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as exp_cfg
from official.vision.beta.projects.panoptic_maskrcnn.dataloaders import panoptic_maskrcnn_input
from official.vision.beta.projects.panoptic_maskrcnn.modeling import factory
from official.vision.beta.tasks import maskrcnn
@task_factory.register_task_cls(exp_cfg.PanopticMaskRCNNTask)
class PanopticMaskRCNNTask(maskrcnn.MaskRCNNTask):
"""A single-replica view of training procedure.
Panoptic Mask R-CNN task provides artifacts for training/evaluation procedures,
including loading/iterating over Datasets, initializing the model, calculating
the loss, post-processing, and customized metrics with reduction.
"""
def build_model(self) -> tf.keras.Model:
"""Build Panoptic Mask R-CNN model."""
input_specs = tf.keras.layers.InputSpec(
shape=[None] + self.task_config.model.input_size)
l2_weight_decay = self.task_config.losses.l2_weight_decay
# Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss.
# (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2)
# (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss)
l2_regularizer = (tf.keras.regularizers.l2(
l2_weight_decay / 2.0) if l2_weight_decay else None)
model = factory.build_panoptic_maskrcnn(
input_specs=input_specs,
model_config=self.task_config.model,
l2_regularizer=l2_regularizer)
return model
def initialize(self, model: tf.keras.Model) -> None:
"""Loading pretrained checkpoint."""
if not self.task_config.init_checkpoint_modules:
return
def _get_checkpoint_path(checkpoint_dir_or_file):
# Accept either a direct checkpoint path or a directory containing one.
checkpoint_path = checkpoint_dir_or_file
if tf.io.gfile.isdir(checkpoint_dir_or_file):
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir_or_file)
return checkpoint_path
for init_module in self.task_config.init_checkpoint_modules:
# Restoring checkpoint.
if init_module == 'all':
checkpoint_path = _get_checkpoint_path(
self.task_config.init_checkpoint)
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.restore(checkpoint_path)
status.assert_consumed()
elif init_module == 'backbone':
checkpoint_path = _get_checkpoint_path(
self.task_config.init_checkpoint)
ckpt = tf.train.Checkpoint(backbone=model.backbone)
status = ckpt.restore(checkpoint_path)
status.expect_partial().assert_existing_objects_matched()
elif init_module == 'segmentation_backbone':
checkpoint_path = _get_checkpoint_path(
self.task_config.segmentation_init_checkpoint)
ckpt = tf.train.Checkpoint(
segmentation_backbone=model.segmentation_backbone)
status = ckpt.restore(checkpoint_path)
status.expect_partial().assert_existing_objects_matched()
elif init_module == 'segmentation_decoder':
checkpoint_path = _get_checkpoint_path(
self.task_config.segmentation_init_checkpoint)
ckpt = tf.train.Checkpoint(
segmentation_decoder=model.segmentation_decoder)
status = ckpt.restore(checkpoint_path)
status.expect_partial().assert_existing_objects_matched()
else:
raise ValueError(
"Only 'all', 'backbone', 'segmentation_backbone' and/or "
"'segmentation_decoder' can be used to initialize the model, but "
"got {}".format(init_module))
logging.info('Finished loading pretrained checkpoint from %s for %s',
checkpoint_path, init_module)
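A sketch of wiring the checkpoint options this method reads; the paths are placeholders, and combining detection and segmentation modules follows the branches above:

from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as exp_cfg

task_config = exp_cfg.PanopticMaskRCNNTask()
# Detection backbone from one checkpoint, segmentation branch from another.
task_config.init_checkpoint = '/path/to/detection/ckpt'
task_config.segmentation_init_checkpoint = '/path/to/segmentation/ckpt'
task_config.init_checkpoint_modules = [
    'backbone', 'segmentation_backbone', 'segmentation_decoder']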
def build_inputs(
self,
params: exp_cfg.DataConfig,
input_context: Optional[tf.distribute.InputContext] = None
) -> tf.data.Dataset:
"""Build input dataset."""
decoder_cfg = params.decoder.get()
if params.decoder.type == 'simple_decoder':
decoder = panoptic_maskrcnn_input.TfExampleDecoder(
regenerate_source_id=decoder_cfg.regenerate_source_id,
mask_binarize_threshold=decoder_cfg.mask_binarize_threshold)
else:
raise ValueError('Unknown decoder type: {}!'.format(params.decoder.type))
parser = panoptic_maskrcnn_input.Parser(
output_size=self.task_config.model.input_size[:2],
min_level=self.task_config.model.min_level,
max_level=self.task_config.model.max_level,
num_scales=self.task_config.model.anchor.num_scales,
aspect_ratios=self.task_config.model.anchor.aspect_ratios,
anchor_size=self.task_config.model.anchor.anchor_size,
dtype=params.dtype,
rpn_match_threshold=params.parser.rpn_match_threshold,
rpn_unmatched_threshold=params.parser.rpn_unmatched_threshold,
rpn_batch_size_per_im=params.parser.rpn_batch_size_per_im,
rpn_fg_fraction=params.parser.rpn_fg_fraction,
aug_rand_hflip=params.parser.aug_rand_hflip,
aug_scale_min=params.parser.aug_scale_min,
aug_scale_max=params.parser.aug_scale_max,
skip_crowd_during_training=params.parser.skip_crowd_during_training,
max_num_instances=params.parser.max_num_instances,
mask_crop_size=params.parser.mask_crop_size,
segmentation_resize_eval_groundtruth=params.parser
.segmentation_resize_eval_groundtruth,
segmentation_groundtruth_padded_size=params.parser
.segmentation_groundtruth_padded_size,
segmentation_ignore_label=params.parser.segmentation_ignore_label)
reader = input_reader_factory.input_reader_generator(
params,
dataset_fn=dataset_fn.pick_dataset_fn(params.file_type),
decoder_fn=decoder.decode,
parser_fn=parser.parse_fn(params.is_training))
dataset = reader.read(input_context=input_context)
return dataset
def build_losses(self,
outputs: Mapping[str, Any],
labels: Mapping[str, Any],
aux_losses: Optional[Any] = None) -> Dict[str, tf.Tensor]:
"""Build Panoptic Mask R-CNN losses."""
params = self.task_config.losses
use_groundtruth_dimension = params.semantic_segmentation_use_groundtruth_dimension
segmentation_loss_fn = segmentation_losses.SegmentationLoss(
label_smoothing=params.semantic_segmentation_label_smoothing,
class_weights=params.semantic_segmentation_class_weights,
ignore_label=params.semantic_segmentation_ignore_label,
use_groundtruth_dimension=use_groundtruth_dimension,
top_k_percent_pixels=params.semantic_segmentation_top_k_percent_pixels)
semantic_segmentation_weight = params.semantic_segmentation_weight
# Compute the Mask R-CNN losses without aux_losses here; the
# regularization term is added to the total loss once below.
losses = super(PanopticMaskRCNNTask, self).build_losses(
outputs=outputs,
labels=labels,
aux_losses=None)
maskrcnn_loss = losses['model_loss']
segmentation_loss = segmentation_loss_fn(
outputs['segmentation_outputs'],
labels['gt_segmentation_mask'])
model_loss = (
maskrcnn_loss + semantic_segmentation_weight * segmentation_loss)
total_loss = model_loss
if aux_losses:
reg_loss = tf.reduce_sum(aux_losses)
total_loss = model_loss + reg_loss
losses.update({
'total_loss': total_loss,
'maskrcnn_loss': maskrcnn_loss,
'segmentation_loss': segmentation_loss,
'model_loss': model_loss,
})
return losses
def build_metrics(self, training: bool = True) -> List[
tf.keras.metrics.Metric]:
"""Build detection metrics."""
metrics = []
if training:
metric_names = [
'total_loss',
'rpn_score_loss',
'rpn_box_loss',
'frcnn_cls_loss',
'frcnn_box_loss',
'mask_loss',
'maskrcnn_loss',
'segmentation_loss',
'model_loss'
]
for name in metric_names:
metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32))
if self.task_config.segmentation_evaluation.report_train_mean_iou:
self.segmentation_train_mean_iou = segmentation_metrics.MeanIoU(
name='train_mean_iou',
num_classes=self.task_config.model.num_classes,
rescale_predictions=False,
dtype=tf.float32)
else:
self.coco_metric = coco_evaluator.COCOEvaluator(
annotation_file=self.task_config.annotation_file,
include_mask=self.task_config.model.include_mask,
per_category_metrics=self.task_config.per_category_metrics)
rescale_predictions = (not self.task_config.validation_data.parser
.segmentation_resize_eval_groundtruth)
self.segmentation_perclass_iou_metric = segmentation_metrics.PerClassIoU(
name='per_class_iou',
num_classes=self.task_config.model.num_classes,
rescale_predictions=rescale_predictions,
dtype=tf.float32)
return metrics
def train_step(self,
inputs: Tuple[Any, Any],
model: tf.keras.Model,
optimizer: tf.keras.optimizers.Optimizer,
metrics: Optional[List[Any]] = None) -> Dict[str, Any]:
"""Does forward and backward.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
optimizer: the optimizer for this training step.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
images, labels = inputs
num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
with tf.GradientTape() as tape:
outputs = model(
images,
image_shape=labels['image_info'][:, 1, :],
anchor_boxes=labels['anchor_boxes'],
gt_boxes=labels['gt_boxes'],
gt_classes=labels['gt_classes'],
gt_masks=(labels['gt_masks'] if self.task_config.model.include_mask
else None),
training=True)
outputs = tf.nest.map_structure(
lambda x: tf.cast(x, tf.float32), outputs)
# Computes per-replica loss.
losses = self.build_losses(
outputs=outputs, labels=labels, aux_losses=model.losses)
scaled_loss = losses['total_loss'] / num_replicas
# For mixed_precision policy, when LossScaleOptimizer is used, loss is
# scaled for numerical stability.
if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
scaled_loss = optimizer.get_scaled_loss(scaled_loss)
tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
# Scales back gradient when LossScaleOptimizer is used.
if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
grads = optimizer.get_unscaled_gradients(grads)
optimizer.apply_gradients(list(zip(grads, tvars)))
logs = {self.loss: losses['total_loss']}
if metrics:
for m in metrics:
m.update_state(losses[m.name])
if self.task_config.segmentation_evaluation.report_train_mean_iou:
segmentation_labels = {
'masks': labels['gt_segmentation_mask'],
'valid_masks': labels['gt_segmentation_valid_mask'],
'image_info': labels['image_info']
}
self.process_metrics(
metrics=[self.segmentation_train_mean_iou],
labels=segmentation_labels,
model_outputs=outputs['segmentation_outputs'])
logs.update({
self.segmentation_train_mean_iou.name:
self.segmentation_train_mean_iou.result()
})
return logs
def validation_step(self,
inputs: Tuple[Any, Any],
model: tf.keras.Model,
metrics: Optional[List[Any]] = None) -> Dict[str, Any]:
"""Validatation step.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
images, labels = inputs
outputs = model(
images,
anchor_boxes=labels['anchor_boxes'],
image_shape=labels['image_info'][:, 1, :],
training=False)
logs = {self.loss: 0}
coco_model_outputs = {
'detection_masks': outputs['detection_masks'],
'detection_boxes': outputs['detection_boxes'],
'detection_scores': outputs['detection_scores'],
'detection_classes': outputs['detection_classes'],
'num_detections': outputs['num_detections'],
'source_id': labels['groundtruths']['source_id'],
'image_info': labels['image_info']
}
segmentation_labels = {
'masks': labels['groundtruths']['gt_segmentation_mask'],
'valid_masks': labels['groundtruths']['gt_segmentation_valid_mask'],
'image_info': labels['image_info']
}
logs.update({
self.coco_metric.name: (labels['groundtruths'], coco_model_outputs),
self.segmentation_perclass_iou_metric.name: (
segmentation_labels,
outputs['segmentation_outputs'])
})
return logs
def aggregate_logs(self, state=None, step_outputs=None):
if state is None:
self.coco_metric.reset_states()
self.segmentation_perclass_iou_metric.reset_states()
state = [self.coco_metric, self.segmentation_perclass_iou_metric]
self.coco_metric.update_state(
step_outputs[self.coco_metric.name][0],
step_outputs[self.coco_metric.name][1])
self.segmentation_perclass_iou_metric.update_state(
step_outputs[self.segmentation_perclass_iou_metric.name][0],
step_outputs[self.segmentation_perclass_iou_metric.name][1])
return state
def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
result = {}
result[self.coco_metric.name] = super(
PanopticMaskRCNNTask, self).reduce_aggregated_logs(
aggregated_logs=aggregated_logs,
global_step=global_step)
ious = self.segmentation_perclass_iou_metric.result()
if self.task_config.segmentation_evaluation.report_per_class_iou:
for i, value in enumerate(ious.numpy()):
result.update({'segmentation_iou/class_{}'.format(i): value})
# Computes mean IoU
result.update({'segmentation_mean_iou': tf.reduce_mean(ious).numpy()})
return result
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic_maskrcnn.py."""
import os
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.configs import decoders as decoder_cfg
from official.vision.beta.configs import semantic_segmentation as segmentation_cfg
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as cfg
from official.vision.beta.projects.panoptic_maskrcnn.tasks import panoptic_maskrcnn
class PanopticMaskRCNNTaskTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
(['all'],),
(['backbone'],),
(['segmentation_backbone'],),
(['segmentation_decoder'],),
(['backbone', 'segmentation_backbone'],),
(['segmentation_backbone', 'segmentation_decoder'],))
def test_model_initializing(self, init_checkpoint_modules):
shared_backbone = ('segmentation_backbone' not in init_checkpoint_modules)
shared_decoder = ('segmentation_decoder' not in init_checkpoint_modules and
shared_backbone)
task_config = cfg.PanopticMaskRCNNTask(
model=cfg.PanopticMaskRCNN(
num_classes=2,
input_size=[640, 640, 3],
segmentation_model=segmentation_cfg.SemanticSegmentationModel(
decoder=decoder_cfg.Decoder(type='fpn')),
shared_backbone=shared_backbone,
shared_decoder=shared_decoder))
task = panoptic_maskrcnn.PanopticMaskRCNNTask(task_config)
model = task.build_model()
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
ckpt_save_dir = self.create_tempdir().full_path
ckpt.save(os.path.join(ckpt_save_dir, 'ckpt'))
if (init_checkpoint_modules == ['all'] or
'backbone' in init_checkpoint_modules):
task._task_config.init_checkpoint = ckpt_save_dir
if ('segmentation_backbone' in init_checkpoint_modules or
'segmentation_decoder' in init_checkpoint_modules):
task._task_config.segmentation_init_checkpoint = ckpt_save_dir
task._task_config.init_checkpoint_modules = init_checkpoint_modules
task.initialize(model)
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-task SimCLR configs."""
import dataclasses
from typing import List, Tuple
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling.multitask import configs as multitask_configs
from official.vision.beta.configs import backbones
from official.vision.beta.configs import common
from official.vision.beta.projects.simclr.configs import simclr as simclr_configs
from official.vision.beta.projects.simclr.modeling import simclr_model
@dataclasses.dataclass
class SimCLRMTHeadConfig(hyperparams.Config):
"""Per-task specific configs."""
# Supervised head is required for finetune, but optional for pretrain.
supervised_head: simclr_configs.SupervisedHead = simclr_configs.SupervisedHead(
num_classes=1001)
mode: str = simclr_model.PRETRAIN
@dataclasses.dataclass
class SimCLRMTModelConfig(hyperparams.Config):
"""Model config for multi-task SimCLR model."""
input_size: List[int] = dataclasses.field(default_factory=list)
backbone: backbones.Backbone = backbones.Backbone(
type='resnet', resnet=backbones.ResNet())
backbone_trainable: bool = True
projection_head: simclr_configs.ProjectionHead = simclr_configs.ProjectionHead(
proj_output_dim=128, num_proj_layers=3, ft_proj_idx=1)
norm_activation: common.NormActivation = common.NormActivation(
norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)
heads: Tuple[SimCLRMTHeadConfig, ...] = ()
# L2 weight decay is used in the model, not in the task.
# Note that this cannot be used together with the LARS optimizer.
l2_weight_decay: float = 0.0
@exp_factory.register_config_factory('multitask_simclr')
def multitask_simclr() -> multitask_configs.MultiTaskExperimentConfig:
return multitask_configs.MultiTaskExperimentConfig(
task=multitask_configs.MultiTaskConfig(
model=SimCLRMTModelConfig(
heads=(SimCLRMTHeadConfig(mode=simclr_model.PRETRAIN),
SimCLRMTHeadConfig(mode=simclr_model.FINETUNE))),
task_routines=(multitask_configs.TaskRoutine(
task_name=simclr_model.PRETRAIN,
task_config=simclr_configs.SimCLRPretrainTask(),
task_weight=2.0),
multitask_configs.TaskRoutine(
task_name=simclr_model.FINETUNE,
task_config=simclr_configs.SimCLRFinetuneTask(),
task_weight=1.0))),
trainer=multitask_configs.MultiTaskTrainerConfig())
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask_config."""
import tensorflow as tf
from official.core import exp_factory
from official.modeling.multitask import configs as multitask_configs
from official.vision.beta.projects.simclr.configs import multitask_config as simclr_multitask_config
from official.vision.beta.projects.simclr.configs import simclr as exp_cfg
class MultitaskConfigTest(tf.test.TestCase):
def test_simclr_configs(self):
config = exp_factory.get_exp_config('multitask_simclr')
self.assertIsInstance(config, multitask_configs.MultiTaskExperimentConfig)
self.assertIsInstance(config.task.model,
simclr_multitask_config.SimCLRMTModelConfig)
self.assertIsInstance(config.task.task_routines[0].task_config,
exp_cfg.SimCLRPretrainTask)
self.assertIsInstance(config.task.task_routines[1].task_config,
exp_cfg.SimCLRFinetuneTask)
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-task image multi-taskSimCLR model definition."""
from typing import Dict, Text
import tensorflow as tf
from official.modeling.multitask import base_model
from official.vision.beta.modeling import backbones
from official.vision.beta.projects.simclr.configs import multitask_config as simclr_multitask_config
from official.vision.beta.projects.simclr.heads import simclr_head
from official.vision.beta.projects.simclr.modeling import simclr_model
PROJECTION_OUTPUT_KEY = 'projection_outputs'
SUPERVISED_OUTPUT_KEY = 'supervised_outputs'
class SimCLRMTModel(base_model.MultiTaskBaseModel):
"""A multi-task SimCLR model that does both pretrain and finetune."""
def __init__(self, config: simclr_multitask_config.SimCLRMTModelConfig,
**kwargs):
self._config = config
# Build shared backbone.
self._input_specs = tf.keras.layers.InputSpec(shape=[None] +
config.input_size)
l2_weight_decay = config.l2_weight_decay
# Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss.
# (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2)
# (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss)
self._l2_regularizer = (
tf.keras.regularizers.l2(l2_weight_decay /
2.0) if l2_weight_decay else None)
self._backbone = backbones.factory.build_backbone(
input_specs=self._input_specs,
backbone_config=config.backbone,
norm_activation_config=config.norm_activation,
l2_regularizer=self._l2_regularizer)
super().__init__(**kwargs)
def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]:
tasks = {}
# Build the shared projection head
norm_activation_config = self._config.norm_activation
projection_head_config = self._config.projection_head
projection_head = simclr_head.ProjectionHead(
proj_output_dim=projection_head_config.proj_output_dim,
num_proj_layers=projection_head_config.num_proj_layers,
ft_proj_idx=projection_head_config.ft_proj_idx,
kernel_regularizer=self._l2_regularizer,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon)
for model_config in self._config.heads:
# Build supervised head
supervised_head_config = model_config.supervised_head
if supervised_head_config:
if supervised_head_config.zero_init:
s_kernel_initializer = 'zeros'
else:
s_kernel_initializer = 'random_uniform'
supervised_head = simclr_head.ClassificationHead(
num_classes=supervised_head_config.num_classes,
kernel_initializer=s_kernel_initializer,
kernel_regularizer=self._l2_regularizer)
else:
supervised_head = None
tasks[model_config.mode] = simclr_model.SimCLRModel(
input_specs=self._input_specs,
backbone=self._backbone,
projection_head=projection_head,
supervised_head=supervised_head,
mode=model_config.mode,
backbone_trainable=self._config.backbone_trainable)
return tasks
# TODO(huythong): Implement initialize function to load the pretrained
# checkpoint of backbone.
# def initialize(self):
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for multitask_model."""
import os.path
import tensorflow as tf
from official.vision.beta.projects.simclr.configs import multitask_config
from official.vision.beta.projects.simclr.modeling import multitask_model
from official.vision.beta.projects.simclr.modeling import simclr_model
class MultitaskModelTest(tf.test.TestCase):
def test_initialize_model_success(self):
ckpt_dir = self.get_temp_dir()
config = multitask_config.SimCLRMTModelConfig(
input_size=[64, 64, 3],
heads=(multitask_config.SimCLRMTHeadConfig(mode=simclr_model.PRETRAIN),
multitask_config.SimCLRMTHeadConfig(mode=simclr_model.FINETUNE)))
model = multitask_model.SimCLRMTModel(config)
self.assertIn(simclr_model.PRETRAIN, model.sub_tasks)
self.assertIn(simclr_model.FINETUNE, model.sub_tasks)
ckpt = tf.train.Checkpoint(backbone=model._backbone)
ckpt.save(os.path.join(ckpt_dir, 'ckpt'))
model.initialize()
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Trainer binary for multitask simclr."""
from absl import app
from absl import flags
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import train_utils
from official.modeling import performance
from official.modeling.multitask import multitask
from official.modeling.multitask import train_lib
# pylint: disable=unused-import
from official.vision.beta.projects.simclr.common import registry_imports
from official.vision.beta.projects.simclr.configs import multitask_config
from official.vision.beta.projects.simclr.modeling import multitask_model
# pylint: enable=unused-import
FLAGS = flags.FLAGS
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
params = train_utils.parse_configuration(FLAGS)
model_dir = FLAGS.model_dir
if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files; otherwise a continuous eval
    # job may race against the train job when writing the same file.
train_utils.serialize_config(params, model_dir)
  # Sets the mixed precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can significantly speed up the model by computing in float16 on GPUs and
  # in bfloat16 on TPUs. loss_scale takes effect only when the dtype is
  # float16.
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
with distribution_strategy.scope():
tasks = multitask.MultiTask.from_config(params.task)
model = multitask_model.SimCLRMTModel(params.task.model)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=tasks,
model=model,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_utils.save_gin_config(FLAGS.mode, model_dir)
if __name__ == '__main__':
tfm_flags.define_flags()
app.run(main)
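# Example invocation (a sketch: the experiment name and paths are
# hypothetical; the flags themselves are the standard ones defined by
# tfm_flags.define_flags()):
#
#   python3 train.py \
#     --experiment=<multitask_simclr_experiment> \
#     --mode=train_and_eval \
#     --model_dir=/tmp/simclr_mt \
#     --config_file=/path/to/multitask_config.yaml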
@@ -59,6 +59,12 @@ VIT_SPECS = {
             patch_size=14,
             transformer=dict(mlp_dim=5120, num_heads=16, num_layers=32),
         ),
+    'vit-g14':
+        dict(
+            hidden_size=1664,
+            patch_size=14,
+            transformer=dict(mlp_dim=8192, num_heads=16, num_layers=48),
+        ),
 }
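# A quick sanity check of the new entry (a sketch; assumes VIT_SPECS is the
# plain dict literal shown above):
#
#   spec = VIT_SPECS['vit-g14']
#   assert spec['hidden_size'] == 1664
#   assert spec['transformer'] == dict(mlp_dim=8192, num_heads=16,
#                                      num_layers=48)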
...
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Mask R-CNN task definition."""
+import os
 from typing import Any, Optional, List, Tuple, Mapping
 from absl import logging
@@ -26,6 +27,7 @@ from official.vision.beta.dataloaders import maskrcnn_input
 from official.vision.beta.dataloaders import tf_example_decoder
 from official.vision.beta.dataloaders import tf_example_label_map_decoder
 from official.vision.beta.evaluation import coco_evaluator
+from official.vision.beta.evaluation import coco_utils
 from official.vision.beta.losses import maskrcnn_losses
 from official.vision.beta.modeling import factory
@@ -259,10 +261,33 @@ class MaskRCNNTask(base_task.Task):
         metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32))
     else:
-      self.coco_metric = coco_evaluator.COCOEvaluator(
-          annotation_file=self._task_config.annotation_file,
-          include_mask=self._task_config.model.include_mask,
-          per_category_metrics=self._task_config.per_category_metrics)
+      if self._task_config.annotation_file:
+        self.coco_metric = coco_evaluator.COCOEvaluator(
+            annotation_file=self._task_config.annotation_file,
+            include_mask=self._task_config.model.include_mask,
+            per_category_metrics=self._task_config.per_category_metrics)
+      else:
+        annotation_path = os.path.join(self._logging_dir, 'annotation.json')
+        if tf.io.gfile.exists(annotation_path):
+          logging.info(
+              'annotation.json file exists, skipping creating the annotation'
+              ' file.')
+        else:
+          if self._task_config.validation_data.num_examples <= 0:
+            raise ValueError('validation_data.num_examples needs to be > 0')
+          if not self._task_config.validation_data.input_path:
+            raise ValueError('Cannot create annotation file for tfds.')
+          logging.info(
+              'Creating coco-style annotation file: %s', annotation_path)
+          coco_utils.scan_and_generator_annotation_file(
+              self._task_config.validation_data.input_path,
+              self._task_config.validation_data.file_type,
+              self._task_config.validation_data.num_examples,
+              self._task_config.model.include_mask, annotation_path)
+        self.coco_metric = coco_evaluator.COCOEvaluator(
+            annotation_file=annotation_path,
+            include_mask=self._task_config.model.include_mask,
+            per_category_metrics=self._task_config.per_category_metrics)
     return metrics
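# How the new branch above is selected (illustrative field values only):
#
#   - task.annotation_file set:   COCOEvaluator reads the provided COCO json.
#   - task.annotation_file unset: the task scans validation_data.input_path
#     (file_type and num_examples must be set; tfds inputs are not supported)
#     and writes <logging_dir>/annotation.json for the evaluator to use.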
...
@@ -446,14 +446,13 @@ class Controller:
           f"{num_steps}. Old value was {current_step}, expected updated value "
           f"to be {expected_step}, but it was {self.global_step.numpy()}.")
       logging.warning(message)
-      return
     train_output = train_output or {}
     for action in self.train_actions:
       action(train_output)
     train_output = tf.nest.map_structure(utils.get_value, train_output)
-    current_step = expected_step
+    current_step = self.global_step.numpy()
     steps_per_second = self.step_timer.steps_per_second()
     _log(f"train | step: {current_step: 6d} | "
          f"steps/sec: {steps_per_second: 6.1f} | "
...