Commit 78c43ef1 authored by Gunho Park

Merge branch 'master' of https://github.com/tensorflow/models

parents 67cfc95b e3c7e300
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converts '3d_2plus1d' checkpoints into '2plus1d'."""
from absl import app
from absl import flags
import tensorflow as tf
from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model
flags.DEFINE_string(
'input_checkpoint_path', None,
'Checkpoint path to load.')
flags.DEFINE_string(
'output_checkpoint_path', None,
'Export path to save the converted checkpoint.')
flags.DEFINE_string(
'model_id', 'a0', 'MoViNet model name.')
flags.DEFINE_bool(
'causal', False, 'Run the model in causal mode.')
flags.DEFINE_bool(
'use_positional_encoding', False,
'Whether to use positional encoding (only applied when causal=True).')
flags.DEFINE_integer(
'num_classes', 600, 'The number of classes for prediction.')
flags.DEFINE_bool(
'verify_output', False, 'Verify the output matches between the models.')
FLAGS = flags.FLAGS
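# Example invocation (hypothetical paths; flags as defined above):
#
#   python3 -m official.vision.beta.projects.movinet.tools.convert_3d_2plus1d \
#     --input_checkpoint_path=/path/to/3d_2plus1d/ckpt-1 \
#     --output_checkpoint_path=/path/to/2plus1d/ckpt \
#     --model_id=a0 \
#     --verify_output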
def main(_) -> None:
backbone_2plus1d = movinet.Movinet(
model_id=FLAGS.model_id,
causal=FLAGS.causal,
conv_type='2plus1d',
use_positional_encoding=FLAGS.use_positional_encoding)
model_2plus1d = movinet_model.MovinetClassifier(
backbone=backbone_2plus1d,
num_classes=FLAGS.num_classes)
model_2plus1d.build([1, 1, 1, 1, 3])
backbone_3d_2plus1d = movinet.Movinet(
model_id=FLAGS.model_id,
causal=FLAGS.causal,
conv_type='3d_2plus1d',
use_positional_encoding=FLAGS.use_positional_encoding)
model_3d_2plus1d = movinet_model.MovinetClassifier(
backbone=backbone_3d_2plus1d,
num_classes=FLAGS.num_classes)
model_3d_2plus1d.build([1, 1, 1, 1, 3])
checkpoint = tf.train.Checkpoint(model=model_3d_2plus1d)
status = checkpoint.restore(FLAGS.input_checkpoint_path)
status.assert_existing_objects_matched()
# Copy weights from the '3d_2plus1d' model into the '2plus1d' model so both
# models end up with the same weights, collapsing extra kernel dims as needed.
weights = []
for var_2plus1d, var_3d_2plus1d in zip(
model_2plus1d.get_weights(), model_3d_2plus1d.get_weights()):
if var_2plus1d.shape == var_3d_2plus1d.shape:
weights.append(var_3d_2plus1d)
else:
if var_3d_2plus1d.shape[0] == 1:
weight = var_3d_2plus1d[0]
else:
weight = var_3d_2plus1d[:, 0]
if weight.shape[-1] != var_2plus1d.shape[-1]:
# Transpose any depthwise kernels (conv3d --> depthwise_conv2d)
weight = tf.transpose(weight, perm=(0, 1, 3, 2))
weights.append(weight)
model_2plus1d.set_weights(weights)
if FLAGS.verify_output:
inputs = tf.random.uniform([1, 6, 64, 64, 3], dtype=tf.float32)
logits_2plus1d = model_2plus1d(inputs)
logits_3d_2plus1d = model_3d_2plus1d(inputs)
if tf.reduce_mean(tf.abs(logits_2plus1d - logits_3d_2plus1d)) > 1e-5:
raise ValueError('Bad conversion, model outputs do not match.')
save_checkpoint = tf.train.Checkpoint(
model=model_2plus1d, backbone=backbone_2plus1d)
save_checkpoint.save(FLAGS.output_checkpoint_path)
if __name__ == '__main__':
flags.mark_flag_as_required('input_checkpoint_path')
flags.mark_flag_as_required('output_checkpoint_path')
app.run(main)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for convert_3d_2plus1d."""
import os
from absl import flags
import tensorflow as tf
from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model
from official.vision.beta.projects.movinet.tools import convert_3d_2plus1d
FLAGS = flags.FLAGS
class Convert3d2plus1dTest(tf.test.TestCase):
def test_convert_model(self):
saved_model_path = self.get_temp_dir()
input_checkpoint_path = os.path.join(saved_model_path, 'ckpt-input')
output_checkpoint_path = os.path.join(saved_model_path, 'ckpt')
model_3d_2plus1d = movinet_model.MovinetClassifier(
backbone=movinet.Movinet(
model_id='a0',
conv_type='3d_2plus1d'),
num_classes=600)
model_3d_2plus1d.build([1, 1, 1, 1, 3])
save_checkpoint = tf.train.Checkpoint(model=model_3d_2plus1d)
save_checkpoint.save(input_checkpoint_path)
FLAGS.input_checkpoint_path = f'{input_checkpoint_path}-1'
FLAGS.output_checkpoint_path = output_checkpoint_path
FLAGS.model_id = 'a0'
FLAGS.use_positional_encoding = False
FLAGS.num_classes = 600
FLAGS.verify_output = True
convert_3d_2plus1d.main('unused_args')
print(os.listdir(saved_model_path))
self.assertTrue(tf.io.gfile.exists(f'{output_checkpoint_path}-1.index'))
if __name__ == '__main__':
tf.test.main()
@@ -46,6 +46,7 @@ from official.modeling import performance
# Import movinet libraries to register the backbone and model into tf.vision
# model garden factory.
# pylint: disable=unused-import
# The following imports are necessary.
from official.vision.beta.projects.movinet.modeling import movinet
from official.vision.beta.projects.movinet.modeling import movinet_model
# pylint: enable=unused-import
# Panoptic Segmentation
## Description
Panoptic Segmentation combines the two distinct vision tasks - semantic
segmentation and instance segmentation. The tasks are unified such that each
pixel in the image is assigned both the label of the class it belongs to and
the instance identifier of the object it is a part of.
## Environment setup
The code can be run on multiple GPUs or TPUs with different distribution
strategies. See the TensorFlow distributed training
[guide](https://www.tensorflow.org/guide/distributed_training) for an overview
of `tf.distribute`.
The code is compatible with TensorFlow 2.4+. See requirements.txt for all
prerequisites; you can install them with
`pip install -r ./official/requirements.txt`.
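As a minimal sketch (assuming a stock TF 2.4+ install; the exact strategy
depends on your hardware), a distribution strategy can be created and used as a
scope when building the model:

```python
import tensorflow as tf

# Pick a strategy based on the available hardware.
if tf.config.list_physical_devices('GPU'):
  strategy = tf.distribute.MirroredStrategy()
else:
  strategy = tf.distribute.get_strategy()  # default single-device strategy

with strategy.scope():
  # Build the model and optimizer here so their variables are placed correctly.
  pass
```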
**DISCLAIMER**: Panoptic MaskRCNN is still under active development, stay tuned!
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Panoptic Mask R-CNN configuration definition."""
from typing import List
import dataclasses
from official.vision.beta.configs import maskrcnn
from official.vision.beta.configs import semantic_segmentation
@dataclasses.dataclass
class Parser(maskrcnn.Parser):
"""Panoptic Segmentation parser config."""
# If resize_eval_segmentation_groundtruth is set to False, original image
# sizes are used for eval. In that case, segmentation_groundtruth_padded_size
# has to be specified too to allow batching of the variable-size input images.
resize_eval_segmentation_groundtruth: bool = True
segmentation_groundtruth_padded_size: List[int] = dataclasses.field(
default_factory=list)
segmentation_ignore_label: int = 255
@dataclasses.dataclass
class DataConfig(maskrcnn.DataConfig):
"""Input config for training."""
parser: Parser = Parser()
@dataclasses.dataclass
class PanopticMaskRCNN(maskrcnn.MaskRCNN):
"""Panoptic Mask R-CNN model config."""
segmentation_model: semantic_segmentation.SemanticSegmentationModel = (
semantic_segmentation.SemanticSegmentationModel(num_classes=2))
shared_backbone: bool = True
shared_decoder: bool = True
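# A minimal usage sketch (assumed, not part of this module): override the
# defaults to use a standalone segmentation backbone and decoder instead of
# sharing the detection ones.
#
#   config = PanopticMaskRCNN(
#       shared_backbone=False,
#       shared_decoder=False,
#       segmentation_model=semantic_segmentation.SemanticSegmentationModel(
#           num_classes=2))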
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data parser and processing for Panoptic Mask R-CNN."""
import tensorflow as tf
from official.vision.beta.dataloaders import maskrcnn_input
from official.vision.beta.dataloaders import tf_example_decoder
from official.vision.beta.ops import preprocess_ops
class TfExampleDecoder(tf_example_decoder.TfExampleDecoder):
"""Tensorflow Example proto decoder."""
def __init__(self, regenerate_source_id, mask_binarize_threshold):
super(TfExampleDecoder, self).__init__(
include_mask=True,
regenerate_source_id=regenerate_source_id,
mask_binarize_threshold=mask_binarize_threshold)
self._segmentation_keys_to_features = {
'image/segmentation/class/encoded':
tf.io.FixedLenFeature((), tf.string, default_value='')
}
def decode(self, serialized_example):
decoded_tensors = super(TfExampleDecoder, self).decode(serialized_example)
segmentation_parsed_tensors = tf.io.parse_single_example(
serialized_example, self._segmentation_keys_to_features)
segmentation_mask = tf.io.decode_image(
segmentation_parsed_tensors['image/segmentation/class/encoded'],
channels=1)
segmentation_mask.set_shape([None, None, 1])
decoded_tensors.update({'groundtruth_segmentation_mask': segmentation_mask})
return decoded_tensors
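# A hedged sketch of how a matching tf.Example could be produced (the feature
# key comes from _segmentation_keys_to_features above; everything else is
# assumed):
#
#   png_bytes = tf.io.encode_png(mask_uint8).numpy()  # [H, W, 1] uint8 mask
#   example = tf.train.Example(features=tf.train.Features(feature={
#       'image/segmentation/class/encoded': tf.train.Feature(
#           bytes_list=tf.train.BytesList(value=[png_bytes])),
#       # ... plus the standard detection and instance-mask features expected
#       # by tf_example_decoder.TfExampleDecoder.
#   }))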
class Parser(maskrcnn_input.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
min_level,
max_level,
num_scales,
aspect_ratios,
anchor_size,
rpn_match_threshold=0.7,
rpn_unmatched_threshold=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5,
aug_rand_hflip=False,
aug_scale_min=1.0,
aug_scale_max=1.0,
skip_crowd_during_training=True,
max_num_instances=100,
mask_crop_size=112,
segmentation_resize_eval_groundtruth=True,
segmentation_groundtruth_padded_size=None,
segmentation_ignore_label=255,
dtype='float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divisible by the largest feature stride 2^max_level.
min_level: `int` number of minimum level of the output feature pyramid.
max_level: `int` number of maximum level of the output feature pyramid.
num_scales: `int` number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: `list` of float numbers representing the aspect ratio
anchors added on each level. The number indicates the ratio of width to
height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: `float` number representing the scale of size of the base
anchor to the feature stride 2^level.
rpn_match_threshold: `float`, match threshold for anchors in RPN.
rpn_unmatched_threshold: `float`, unmatched threshold for anchors in RPN.
rpn_batch_size_per_im: `int` for batch size per image in RPN.
rpn_fg_fraction: `float` for foreground fraction per batch in RPN.
aug_rand_hflip: `bool`, if True, augment training with random
horizontal flip.
aug_scale_min: `float`, the minimum scale applied to `output_size` for
data augmentation during training.
aug_scale_max: `float`, the maximum scale applied to `output_size` for
data augmentation during training.
skip_crowd_during_training: `bool`, if True, skip annotations labeled with
`is_crowd` equal to 1.
max_num_instances: `int`, the maximum number of instances in an
image. The groundtruth data will be padded to `max_num_instances`.
mask_crop_size: the size to which the groundtruth mask is cropped.
segmentation_resize_eval_groundtruth: `bool`, if True, eval groundtruth
masks are resized to output_size.
segmentation_groundtruth_padded_size: `Tensor` or `list` for [height,
width]. When segmentation_resize_eval_groundtruth is set to False, the
groundtruth masks are padded to this size.
segmentation_ignore_label: `int`, pixels with the ignore label will not be
used for training and evaluation.
dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}.
"""
super(Parser, self).__init__(
output_size=output_size,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
rpn_match_threshold=rpn_match_threshold,
rpn_unmatched_threshold=rpn_unmatched_threshold,
rpn_batch_size_per_im=rpn_batch_size_per_im,
rpn_fg_fraction=rpn_fg_fraction,
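# Horizontal flipping is disabled in the parent parser and handled in
# _parse_train_data below, so the segmentation mask is flipped together
# with the image, boxes and instance masks.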
aug_rand_hflip=False,
aug_scale_min=aug_scale_min,
aug_scale_max=aug_scale_max,
skip_crowd_during_training=skip_crowd_during_training,
max_num_instances=max_num_instances,
include_mask=True,
mask_crop_size=mask_crop_size,
dtype=dtype)
self.aug_rand_hflip = aug_rand_hflip
self._segmentation_resize_eval_groundtruth = segmentation_resize_eval_groundtruth
if (not segmentation_resize_eval_groundtruth) and (
segmentation_groundtruth_padded_size is None):
raise ValueError(
'segmentation_groundtruth_padded_size ([height, width]) needs to be '
'specified when resize_eval_segmentation_groundtruth is False.')
self._segmentation_groundtruth_padded_size = segmentation_groundtruth_padded_size
self._segmentation_ignore_label = segmentation_ignore_label
def _parse_train_data(self, data):
"""Parses data for training.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
image: image tensor that is preprocessed to have normalized values and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for training. The following describes
{key: value} pairs in the dictionary.
image_info: a 2D `Tensor` that encodes the information of the image and
the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width]],
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each level.
rpn_score_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
rpn_box_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
gt_boxes: Groundtruth bounding box annotations. The box is represented
in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
image that is fed to the network. The tensor is padded with -1 to
the fixed dimension [self._max_num_instances, 4].
gt_classes: Groundtruth class annotations. The tensor is padded
with -1 to the fixed dimension [self._max_num_instances].
gt_masks: Groundtruth masks cropped by the bounding box and
resized to a fixed size determined by mask_crop_size.
gt_segmentation_mask: Groundtruth mask for segmentation head, this is
resized to a fixed size determined by output_size.
gt_segmentation_valid_mask: Binary mask that marks the pixels that
are supposed to be used in computing the segmentation loss while
training.
"""
segmentation_mask = data['groundtruth_segmentation_mask']
# Flips image randomly during training.
if self.aug_rand_hflip:
masks = data['groundtruth_instance_masks']
image_mask = tf.concat([data['image'], segmentation_mask], axis=2)
image_mask, boxes, masks = preprocess_ops.random_horizontal_flip(
image_mask, data['groundtruth_boxes'], masks)
segmentation_mask = image_mask[:, :, -1:]
image = image_mask[:, :, :-1]
data['image'] = image
data['groundtruth_boxes'] = boxes
data['groundtruth_instance_masks'] = masks
image, labels = super(Parser, self)._parse_train_data(data)
image_info = labels['image_info']
image_scale = image_info[2, :]
offset = image_info[3, :]
segmentation_mask = tf.reshape(
segmentation_mask, shape=[1, data['height'], data['width']])
segmentation_mask = tf.cast(segmentation_mask, tf.float32)
# Pad the label and make sure the padded region is assigned the ignore label.
# The label is first offset by +1 and then padded with 0, so padded pixels
# become -1 once the offset is removed and can be mapped to the ignore label.
segmentation_mask += 1
segmentation_mask = tf.expand_dims(segmentation_mask, axis=3)
segmentation_mask = preprocess_ops.resize_and_crop_masks(
segmentation_mask, image_scale, self._output_size, offset)
segmentation_mask -= 1
segmentation_mask = tf.where(
tf.equal(segmentation_mask, -1),
self._segmentation_ignore_label * tf.ones_like(segmentation_mask),
segmentation_mask)
segmentation_mask = tf.squeeze(segmentation_mask, axis=0)
segmentation_valid_mask = tf.not_equal(
segmentation_mask, self._segmentation_ignore_label)
labels.update({
'gt_segmentation_mask': segmentation_mask,
'gt_segmentation_valid_mask': segmentation_valid_mask})
return image, labels
def _parse_eval_data(self, data):
"""Parses data for evaluation.
Args:
data: the decoded tensor dictionary from TfExampleDecoder.
Returns:
A dictionary of {'images': image, 'labels': labels} where
image: image tensor that is preprocessed to have normalized values and
dimension [output_size[0], output_size[1], 3]
labels: a dictionary of tensors used for training. The following
describes {key: value} pairs in the dictionary.
source_ids: Source image id. Default value -1 if the source id is
empty in the groundtruth annotation.
image_info: a 2D `Tensor` that encodes the information of the image
and the applied preprocessing. It is in the format of
[[original_height, original_width], [scaled_height, scaled_width]],
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each
level.
"""
segmentation_mask = tf.cast(
data['groundtruth_segmentation_mask'], tf.float32)
segmentation_mask = tf.reshape(
segmentation_mask, shape=[1, data['height'], data['width'], 1])
segmentation_mask += 1
image, labels = super(Parser, self)._parse_eval_data(data)
if self._segmentation_resize_eval_groundtruth:
# Resizes eval masks to match input image sizes. In that case, mean IoU
# is computed on output_size, not the original size of the images.
image_info = labels['image_info']
image_scale = image_info[2, :]
offset = image_info[3, :]
segmentation_mask = preprocess_ops.resize_and_crop_masks(
segmentation_mask, image_scale, self._output_size, offset)
else:
segmentation_mask = tf.image.pad_to_bounding_box(
segmentation_mask, 0, 0,
self._segmentation_groundtruth_padded_size[0],
self._segmentation_groundtruth_padded_size[1])
segmentation_mask -= 1
# Assign ignore label to the padded region.
segmentation_mask = tf.where(
tf.equal(segmentation_mask, -1),
self._segmentation_ignore_label * tf.ones_like(segmentation_mask),
segmentation_mask)
segmentation_mask = tf.squeeze(segmentation_mask, axis=0)
segmentation_valid_mask = tf.not_equal(
segmentation_mask, self._segmentation_ignore_label)
labels['groundtruths'].update({
'gt_segmentation_mask': segmentation_mask,
'gt_segmentation_valid_mask': segmentation_valid_mask})
return image, labels
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Factory method to build panoptic segmentation model."""
import tensorflow as tf
from official.vision.beta.modeling import backbones
from official.vision.beta.modeling import factory as models_factory
from official.vision.beta.modeling.decoders import factory as decoder_factory
from official.vision.beta.modeling.heads import segmentation_heads
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as panoptic_maskrcnn_cfg
from official.vision.beta.projects.panoptic_maskrcnn.modeling import panoptic_maskrcnn_model
def build_panoptic_maskrcnn(
input_specs: tf.keras.layers.InputSpec,
model_config: panoptic_maskrcnn_cfg.PanopticMaskRCNN,
l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model:
"""Builds Panoptic Mask R-CNN model.
This factory function builds the Mask R-CNN model first, then builds the
non-shared semantic segmentation layers, and finally combines the two models
to form the panoptic segmentation model.
Args:
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
model_config: Config instance for the panoptic maskrcnn model.
l2_regularizer: Optional `tf.keras.regularizers.Regularizer`, if specified,
the model is built with the provided regularization layer.
Returns:
tf.keras.Model for the panoptic segmentation model.
"""
norm_activation_config = model_config.norm_activation
segmentation_config = model_config.segmentation_model
# Builds the maskrcnn model.
maskrcnn_model = models_factory.build_maskrcnn(
input_specs=input_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
# Builds the semantic segmentation branch.
if not model_config.shared_backbone:
segmentation_backbone = backbones.factory.build_backbone(
input_specs=input_specs,
backbone_config=segmentation_config.backbone,
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
segmentation_decoder_input_specs = segmentation_backbone.output_specs
else:
segmentation_backbone = None
segmentation_decoder_input_specs = maskrcnn_model.backbone.output_specs
if not model_config.shared_decoder:
segmentation_decoder = decoder_factory.build_decoder(
input_specs=segmentation_decoder_input_specs,
model_config=segmentation_config,
l2_regularizer=l2_regularizer)
else:
segmentation_decoder = None
segmentation_head_config = segmentation_config.head
detection_head_config = model_config.detection_head
segmentation_head = segmentation_heads.SegmentationHead(
num_classes=segmentation_config.num_classes,
level=segmentation_head_config.level,
num_convs=segmentation_head_config.num_convs,
prediction_kernel_size=segmentation_head_config.prediction_kernel_size,
num_filters=segmentation_head_config.num_filters,
upsample_factor=segmentation_head_config.upsample_factor,
feature_fusion=segmentation_head_config.feature_fusion,
low_level=segmentation_head_config.low_level,
low_level_num_filters=segmentation_head_config.low_level_num_filters,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
norm_momentum=norm_activation_config.norm_momentum,
norm_epsilon=norm_activation_config.norm_epsilon,
kernel_regularizer=l2_regularizer)
# Combines the Mask R-CNN and segmentation models to build the panoptic
# segmentation model.
model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
backbone=maskrcnn_model.backbone,
decoder=maskrcnn_model.decoder,
rpn_head=maskrcnn_model.rpn_head,
detection_head=maskrcnn_model.detection_head,
roi_generator=maskrcnn_model.roi_generator,
roi_sampler=maskrcnn_model.roi_sampler,
roi_aligner=maskrcnn_model.roi_aligner,
detection_generator=maskrcnn_model.detection_generator,
mask_head=maskrcnn_model.mask_head,
mask_sampler=maskrcnn_model.mask_sampler,
mask_roi_aligner=maskrcnn_model.mask_roi_aligner,
segmentation_backbone=segmentation_backbone,
segmentation_decoder=segmentation_decoder,
segmentation_head=segmentation_head,
class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred,
cascade_class_ensemble=detection_head_config.cascade_class_ensemble,
min_level=model_config.min_level,
max_level=model_config.max_level,
num_scales=model_config.anchor.num_scales,
aspect_ratios=model_config.anchor.aspect_ratios,
anchor_size=model_config.anchor.anchor_size)
return model
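# A hedged usage sketch (mirrors factory_test.py below; values are assumed):
#
#   input_specs = tf.keras.layers.InputSpec(shape=[None, 640, 640, 3])
#   model_config = panoptic_maskrcnn_cfg.PanopticMaskRCNN(num_classes=2)
#   model = build_panoptic_maskrcnn(
#       input_specs=input_specs,
#       model_config=model_config,
#       l2_regularizer=tf.keras.regularizers.l2(5e-5))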
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for factory.py."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.beta.configs import backbones
from official.vision.beta.configs import decoders
from official.vision.beta.configs import semantic_segmentation
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_maskrcnn as panoptic_maskrcnn_cfg
from official.vision.beta.projects.panoptic_maskrcnn.modeling import factory
class PanopticMaskRCNNBuilderTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
('resnet', (640, 640), 'dilated_resnet', 'fpn'),
('resnet', (640, 640), 'dilated_resnet', 'aspp'),
('resnet', (640, 640), None, 'fpn'),
('resnet', (640, 640), None, 'aspp'),
('resnet', (640, 640), None, None),
('resnet', (None, None), 'dilated_resnet', 'fpn'),
('resnet', (None, None), 'dilated_resnet', 'aspp'),
('resnet', (None, None), None, 'fpn'),
('resnet', (None, None), None, 'aspp'),
('resnet', (None, None), None, None)
)
def test_builder(self, backbone_type, input_size, segmentation_backbone_type,
segmentation_decoder_type):
num_classes = 2
input_specs = tf.keras.layers.InputSpec(
shape=[None, input_size[0], input_size[1], 3])
segmentation_output_stride = 16
level = int(np.math.log2(segmentation_output_stride))
segmentation_model = semantic_segmentation.SemanticSegmentationModel(
num_classes=2,
backbone=backbones.Backbone(type=segmentation_backbone_type),
decoder=decoders.Decoder(type=segmentation_decoder_type),
head=semantic_segmentation.SegmentationHead(level=level))
model_config = panoptic_maskrcnn_cfg.PanopticMaskRCNN(
num_classes=num_classes,
segmentation_model=segmentation_model,
backbone=backbones.Backbone(type=backbone_type),
shared_backbone=segmentation_backbone_type is None,
shared_decoder=segmentation_decoder_type is None)
l2_regularizer = tf.keras.regularizers.l2(5e-5)
_ = factory.build_panoptic_maskrcnn(
input_specs=input_specs,
model_config=model_config,
l2_regularizer=l2_regularizer)
if __name__ == '__main__':
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Panoptic Segmentation model."""
from typing import List, Mapping, Optional, Union
import tensorflow as tf
from official.vision.beta.modeling import maskrcnn_model
@tf.keras.utils.register_keras_serializable(package='Vision')
class PanopticMaskRCNNModel(maskrcnn_model.MaskRCNNModel):
"""The Panoptic Segmentation model."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
rpn_head: tf.keras.layers.Layer,
detection_head: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_generator: tf.keras.layers.Layer,
roi_sampler: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_aligner: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
mask_head: Optional[tf.keras.layers.Layer] = None,
mask_sampler: Optional[tf.keras.layers.Layer] = None,
mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
segmentation_backbone: Optional[tf.keras.Model] = None,
segmentation_decoder: Optional[tf.keras.Model] = None,
segmentation_head: tf.keras.layers.Layer = None,
class_agnostic_bbox_pred: bool = False,
cascade_class_ensemble: bool = False,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Initializes the Panoptic Mask R-CNN model.
Args:
backbone: `tf.keras.Model`, the backbone network.
decoder: `tf.keras.Model`, the decoder network.
rpn_head: the RPN head.
detection_head: the detection head or a list of heads.
roi_generator: the ROI generator.
roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
detection heads.
roi_aligner: the ROI aligner.
detection_generator: the detection generator.
mask_head: the mask head.
mask_sampler: the mask sampler.
mask_roi_aligner: the ROI aligner for mask prediction.
segmentation_backbone: `tf.keras.Model`, the backbone network for the
segmentation head for panoptic task. Providing `segmentation_backbone`
will allow the segmentation head to use a standalone backbone. Setting
`segmentation_backbone=None` would enable backbone sharing between the
MaskRCNN model and segmentation head.
segmentation_decoder: `tf.keras.Model`, the decoder network for the
segmentation head for panoptic task. Providing `segmentation_decoder`
will allow the segmentation head to use a standalone decoder. Setting
`segmentation_decoder=None` would enable decoder sharing between the
MaskRCNN model and segmentation head. Decoders can only be shared when
`segmentation_backbone` is shared as well.
segmentation_head: segmentation head for panoptic task.
class_agnostic_bbox_pred: if True, perform class agnostic bounding box
prediction. Needs to be `True` for Cascade RCNN models.
cascade_class_ensemble: if True, ensemble classification scores over all
detection heads.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added on each level.
For instance, num_scales=2 adds one additional intermediate anchor scale
[2^0, 2^0.5] on each level.
aspect_ratios: A list representing the aspect ratio anchors added on each
level. The number indicates the ratio of width to height. For instance,
aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
anchor_size: A number representing the scale of size of the base anchor to
the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
super(PanopticMaskRCNNModel, self).__init__(
backbone=backbone,
decoder=decoder,
rpn_head=rpn_head,
detection_head=detection_head,
roi_generator=roi_generator,
roi_sampler=roi_sampler,
roi_aligner=roi_aligner,
detection_generator=detection_generator,
mask_head=mask_head,
mask_sampler=mask_sampler,
mask_roi_aligner=mask_roi_aligner,
class_agnostic_bbox_pred=class_agnostic_bbox_pred,
cascade_class_ensemble=cascade_class_ensemble,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
**kwargs)
self._config_dict.update({
'segmentation_backbone': segmentation_backbone,
'segmentation_decoder': segmentation_decoder,
'segmentation_head': segmentation_head
})
if not self._include_mask:
raise ValueError(
'`mask_head` needs to be provided for Panoptic Mask R-CNN.')
if segmentation_backbone is not None and segmentation_decoder is None:
raise ValueError(
'`segmentation_decoder` needs to be provided for Panoptic Mask R-CNN '
'if `backbone` is not shared.')
self.segmentation_backbone = segmentation_backbone
self.segmentation_decoder = segmentation_decoder
self.segmentation_head = segmentation_head
def call(self,
images: tf.Tensor,
image_shape: tf.Tensor,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
gt_boxes: Optional[tf.Tensor] = None,
gt_classes: Optional[tf.Tensor] = None,
gt_masks: Optional[tf.Tensor] = None,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
model_outputs = super(PanopticMaskRCNNModel, self).call(
images=images,
image_shape=image_shape,
anchor_boxes=anchor_boxes,
gt_boxes=gt_boxes,
gt_classes=gt_classes,
gt_masks=gt_masks,
training=training)
if self.segmentation_backbone is not None:
backbone_features = self.segmentation_backbone(images, training=training)
else:
backbone_features = model_outputs['backbone_features']
if self.segmentation_decoder is not None:
decoder_features = self.segmentation_decoder(
backbone_features, training=training)
else:
decoder_features = model_outputs['decoder_features']
segmentation_outputs = self.segmentation_head(
backbone_features, decoder_features, training=training)
model_outputs.update({
'segmentation_outputs': segmentation_outputs,
})
return model_outputs
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = super(PanopticMaskRCNNModel, self).checkpoint_items
if self.segmentation_backbone is not None:
items.update(segmentation_backbone=self.segmentation_backbone)
if self.segmentation_decoder is not None:
items.update(segmentation_decoder=self.segmentation_decoder)
items.update(segmentation_head=self.segmentation_head)
return items
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic_maskrcnn_model.py."""
import os
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import aspp
from official.vision.beta.modeling.decoders import fpn
from official.vision.beta.modeling.heads import dense_prediction_heads
from official.vision.beta.modeling.heads import instance_heads
from official.vision.beta.modeling.heads import segmentation_heads
from official.vision.beta.modeling.layers import detection_generator
from official.vision.beta.modeling.layers import mask_sampler
from official.vision.beta.modeling.layers import roi_aligner
from official.vision.beta.modeling.layers import roi_generator
from official.vision.beta.modeling.layers import roi_sampler
from official.vision.beta.ops import anchor
from official.vision.beta.projects.panoptic_maskrcnn.modeling import panoptic_maskrcnn_model
class PanopticMaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(
combinations.combine(
use_separable_conv=[True, False],
build_anchor_boxes=[True, False],
shared_backbone=[True, False],
shared_decoder=[True, False],
is_training=[True, False]))
def test_build_model(self,
use_separable_conv,
build_anchor_boxes,
shared_backbone,
shared_decoder,
is_training=True):
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
resnet_model_id = 50
segmentation_resnet_model_id = 50
segmentation_output_stride = 16
aspp_dilation_rates = [6, 12, 18]
aspp_decoder_level = int(np.math.log2(segmentation_output_stride))
fpn_decoder_level = 3
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 128
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
shared_decoder = shared_decoder and shared_backbone
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
else:
anchor_boxes = None
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location,
num_convs=1)
detection_head = instance_heads.DetectionHead(num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
if shared_backbone:
segmentation_backbone = None
else:
segmentation_backbone = resnet.ResNet(
model_id=segmentation_resnet_model_id)
if not shared_decoder:
level = aspp_decoder_level
segmentation_decoder = aspp.ASPP(
level=level, dilation_rates=aspp_dilation_rates)
else:
level = fpn_decoder_level
segmentation_decoder = None
segmentation_head = segmentation_heads.SegmentationHead(
num_classes=2,  # stuff and a common class for things
level=level,
num_convs=2)
model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
segmentation_backbone=segmentation_backbone,
segmentation_decoder=segmentation_decoder,
segmentation_head=segmentation_head,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
gt_masks = np.ones((2, 3, 100, 100))
# Results will be checked in test_forward.
_ = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=is_training)
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
shared_backbone=[True, False],
shared_decoder=[True, False],
training=[True, False],
))
def test_forward(self, strategy, training,
shared_backbone, shared_decoder):
num_classes = 3
min_level = 3
max_level = 4
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
segmentation_resnet_model_id = 101
segmentation_output_stride = 16
aspp_dilation_rates = [6, 12, 18]
aspp_decoder_level = int(np.math.log2(segmentation_output_stride))
fpn_decoder_level = 3
class_agnostic_bbox_pred = False
cascade_class_ensemble = False
image_size = (256, 256)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array([[224, 100], [100, 224]])
shared_decoder = shared_decoder and shared_backbone
with strategy.scope():
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
image_size=image_size).multilevel_boxes
num_anchors_per_location = len(aspect_ratios) * num_scales
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=min_level,
max_level=max_level,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes,
class_agnostic_bbox_pred=class_agnostic_bbox_pred)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_cascade = []
roi_sampler_obj = roi_sampler.ROISampler()
roi_sampler_cascade.append(roi_sampler_obj)
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
if shared_backbone:
segmentation_backbone = None
else:
segmentation_backbone = resnet.ResNet(
model_id=segmentation_resnet_model_id)
if not shared_decoder:
level = aspp_decoder_level
segmentation_decoder = aspp.ASPP(
level=level, dilation_rates=aspp_dilation_rates)
else:
level = fpn_decoder_level
segmentation_decoder = None
segmentation_head = segmentation_heads.SegmentationHead(
num_classes=2,  # stuff and a common class for things
level=level,
num_convs=2)
model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
segmentation_backbone=segmentation_backbone,
segmentation_decoder=segmentation_decoder,
segmentation_head=segmentation_head,
class_agnostic_bbox_pred=class_agnostic_bbox_pred,
cascade_class_ensemble=cascade_class_ensemble,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
gt_masks = np.ones((2, 3, 100, 100))
results = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=training)
self.assertIn('rpn_boxes', results)
self.assertIn('rpn_scores', results)
if training:
self.assertIn('class_targets', results)
self.assertIn('box_targets', results)
self.assertIn('class_outputs', results)
self.assertIn('box_outputs', results)
self.assertIn('mask_outputs', results)
else:
self.assertIn('detection_boxes', results)
self.assertIn('detection_scores', results)
self.assertIn('detection_classes', results)
self.assertIn('num_detections', results)
self.assertIn('detection_masks', results)
self.assertIn('segmentation_outputs', results)
self.assertAllEqual(
[2, image_size[0] // (2**level), image_size[1] // (2**level), 2],
results['segmentation_outputs'].numpy().shape)
@combinations.generate(
combinations.combine(
shared_backbone=[True, False], shared_decoder=[True, False]))
def test_serialize_deserialize(self, shared_backbone, shared_decoder):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
segmentation_resnet_model_id = 101
segmentation_output_stride = 16
aspp_dilation_rates = [6, 12, 18]
aspp_decoder_level = int(np.math.log2(segmentation_output_stride))
fpn_decoder_level = 3
shared_decoder = shared_decoder and shared_backbone
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
if shared_backbone:
segmentation_backbone = None
else:
segmentation_backbone = resnet.ResNet(
model_id=segmentation_resnet_model_id)
if not shared_decoder:
level = aspp_decoder_level
segmentation_decoder = aspp.ASPP(
level=level, dilation_rates=aspp_dilation_rates)
else:
level = fpn_decoder_level
segmentation_decoder = None
segmentation_head = segmentation_heads.SegmentationHead(
num_classes=2,  # stuff and a common class for things
level=level,
num_convs=2)
model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
segmentation_backbone=segmentation_backbone,
segmentation_decoder=segmentation_decoder,
segmentation_head=segmentation_head,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
config = model.get_config()
new_model = panoptic_maskrcnn_model.PanopticMaskRCNNModel.from_config(
config)
# Validate that the config can be serialized to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
@combinations.generate(
combinations.combine(
shared_backbone=[True, False], shared_decoder=[True, False]))
def test_checkpoint(self, shared_backbone, shared_decoder):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
segmentation_resnet_model_id = 101
segmentation_output_stride = 16
aspp_dilation_rates = [6, 12, 18]
aspp_decoder_level = int(np.math.log2(segmentation_output_stride))
fpn_decoder_level = 3
shared_decoder = shared_decoder and shared_backbone
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
if shared_backbone:
segmentation_backbone = None
else:
segmentation_backbone = resnet.ResNet(
model_id=segmentation_resnet_model_id)
if not shared_decoder:
level = aspp_decoder_level
segmentation_decoder = aspp.ASPP(
level=level, dilation_rates=aspp_dilation_rates)
else:
level = fpn_decoder_level
segmentation_decoder = None
segmentation_head = segmentation_heads.SegmentationHead(
num_classes=2,  # stuff and a common class for things
level=level,
num_convs=2)
model = panoptic_maskrcnn_model.PanopticMaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
segmentation_backbone=segmentation_backbone,
segmentation_decoder=segmentation_decoder,
segmentation_head=segmentation_head,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
expect_checkpoint_items = dict(
backbone=backbone,
decoder=decoder,
rpn_head=rpn_head,
detection_head=[detection_head])
expect_checkpoint_items['mask_head'] = mask_head
if not shared_backbone:
expect_checkpoint_items['segmentation_backbone'] = segmentation_backbone
if not shared_decoder:
expect_checkpoint_items['segmentation_decoder'] = segmentation_decoder
expect_checkpoint_items['segmentation_head'] = segmentation_head
self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)
# Test save and load checkpoints.
ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
save_dir = self.create_tempdir().full_path
ckpt.save(os.path.join(save_dir, 'ckpt'))
partial_ckpt = tf.train.Checkpoint(backbone=backbone)
partial_ckpt.restore(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
partial_ckpt_mask = tf.train.Checkpoint(
backbone=backbone, mask_head=mask_head)
partial_ckpt_mask.restore(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if not shared_backbone:
partial_ckpt_segmentation = tf.train.Checkpoint(
segmentation_backbone=segmentation_backbone,
segmentation_decoder=segmentation_decoder,
segmentation_head=segmentation_head)
elif not shared_decoder:
partial_ckpt_segmentation = tf.train.Checkpoint(
segmentation_decoder=segmentation_decoder,
segmentation_head=segmentation_head)
else:
partial_ckpt_segmentation = tf.train.Checkpoint(
segmentation_head=segmentation_head)
partial_ckpt_segmentation.restore(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if __name__ == '__main__':
tf.test.main()
@@ -97,7 +97,6 @@ class ProjectionHead(tf.keras.layers.Layer):
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'use_normalization': self._use_normalization,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
@@ -90,14 +90,15 @@ class SimCLRModel(tf.keras.Model):
if training and self._mode == PRETRAIN:
num_transforms = 2
# Split channels, and optionally apply extra batched augmentation.
# (bsz, h, w, c*num_transforms) -> [(bsz, h, w, c), ....]
features_list = tf.split(
inputs, num_or_size_splits=num_transforms, axis=-1)
# (num_transforms * bsz, h, w, c)
features = tf.concat(features_list, 0)
else:
num_transforms = 1
# Split channels, and optionally apply extra batched augmentation.
# (bsz, h, w, c*num_transforms) -> [(bsz, h, w, c), ....]
features_list = tf.split(inputs, num_or_size_splits=num_transforms, axis=-1)
# (num_transforms * bsz, h, w, c)
features = tf.concat(features_list, 0)
features = inputs
# Base network forward pass.
endpoints = self._backbone(features, training=training)
@@ -415,7 +415,8 @@ class SimCLRFinetuneTask(base_task.Task):
backbone = backbones.factory.build_backbone(
input_specs=input_specs,
model_config=model_config,
backbone_config=model_config.backbone,
norm_activation_config=model_config.norm_activation,
l2_regularizer=l2_regularizer)
norm_activation_config = model_config.norm_activation
# Vision Transformer (ViT)
**DISCLAIMER**: This implementation is still under development. No support will
be provided during the development phase.
[![Paper](http://img.shields.io/badge/Paper-arXiv.2010.11929-B3181B?logo=arXiv)](https://arxiv.org/abs/2010.11929)
This repository contains the implementation of the Vision Transformer (ViT) in
TensorFlow 2.
* Paper title:
[An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf).
\ No newline at end of file
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Configs package definition."""
from official.vision.beta.projects.vit.configs import image_classification
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Backbones configurations."""
from typing import Optional
import dataclasses
from official.modeling import hyperparams
@dataclasses.dataclass
class Transformer(hyperparams.Config):
"""Transformer config."""
mlp_dim: int = 1
num_heads: int = 1
num_layers: int = 1
attention_dropout_rate: float = 0.0
dropout_rate: float = 0.1
@dataclasses.dataclass
class VisionTransformer(hyperparams.Config):
"""VisionTransformer config."""
model_name: str = 'vit-b16'
# pylint: disable=line-too-long
classifier: str = 'token' # 'token' or 'gap'. If set to 'token', an extra classification token is added to sequence.
# pylint: enable=line-too-long
representation_size: int = 0
hidden_size: int = 1
patch_size: int = 16
transformer: Transformer = Transformer()
@dataclasses.dataclass
class Backbone(hyperparams.OneOfConfig):
"""Configuration for backbones.
Attributes:
type: 'str', type of backbone to be used, one of the fields below.
vit: vit backbone config.
"""
type: Optional[str] = None
vit: VisionTransformer = VisionTransformer()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Image classification configuration definition."""
import os
from typing import List, Optional
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.core import task_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.beta.configs import common
from official.vision.beta.configs import image_classification as img_cls_cfg
from official.vision.beta.projects.vit.configs import backbones
from official.vision.beta.tasks import image_classification
DataConfig = img_cls_cfg.DataConfig
@dataclasses.dataclass
class ImageClassificationModel(hyperparams.Config):
"""The model config."""
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
backbone: backbones.Backbone = backbones.Backbone(
type='vit', vit=backbones.VisionTransformer())
dropout_rate: float = 0.0
norm_activation: common.NormActivation = common.NormActivation(
use_sync_bn=False)
  # Whether to add a BatchNormalization layer before GlobalAveragePooling in
  # the classification head.
add_head_batch_norm: bool = False
@dataclasses.dataclass
class Losses(hyperparams.Config):
one_hot: bool = True
label_smoothing: float = 0.0
l2_weight_decay: float = 0.0
@dataclasses.dataclass
class Evaluation(hyperparams.Config):
top_k: int = 5
@dataclasses.dataclass
class ImageClassificationTask(cfg.TaskConfig):
"""The task config. Same as the classification task for convnets."""
model: ImageClassificationModel = ImageClassificationModel()
train_data: DataConfig = DataConfig(is_training=True)
validation_data: DataConfig = DataConfig(is_training=False)
losses: Losses = Losses()
evaluation: Evaluation = Evaluation()
init_checkpoint: Optional[str] = None
init_checkpoint_modules: str = 'all' # all or backbone
IMAGENET_TRAIN_EXAMPLES = 1281167
IMAGENET_VAL_EXAMPLES = 50000
IMAGENET_INPUT_PATH_BASE = 'imagenet-2012-tfrecord'
# TODO(b/177942984): integrate the experiments into TF-vision.
task_factory.register_task_cls(ImageClassificationTask)(
image_classification.ImageClassificationTask)
@exp_factory.register_config_factory('vit_imagenet_pretrain')
def image_classification_imagenet_vit_pretrain() -> cfg.ExperimentConfig:
"""Image classification on imagenet with vision transformer."""
train_batch_size = 4096
eval_batch_size = 4096
steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size
config = cfg.ExperimentConfig(
task=ImageClassificationTask(
model=ImageClassificationModel(
num_classes=1001,
input_size=[224, 224, 3],
backbone=backbones.Backbone(
type='vit',
vit=backbones.VisionTransformer(
model_name='vit-b16', representation_size=768))),
losses=Losses(l2_weight_decay=0.0),
train_data=DataConfig(
input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size),
validation_data=DataConfig(
input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=300 * steps_per_epoch,
validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate': 0.3,
'include_in_weight_decay': r'.*(kernel|weight):0$',
}
},
'learning_rate': {
'type': 'cosine',
'cosine': {
'initial_learning_rate': 0.003,
'decay_steps': 300 * steps_per_epoch,
}
},
'warmup': {
'type': 'linear',
'linear': {
'warmup_steps': 10000,
'warmup_learning_rate': 0
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
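
# Worked numbers for the schedule above, derived from the constants in this
# file: with a global batch size of 4096, one epoch is
# 1281167 // 4096 = 312 steps, so the cosine decay runs over
# 300 * 312 = 93600 steps after a 10000-step linear warmup from 0 to 3e-3.
def _example_pretrain_schedule_steps():
  """Recomputes the step counts implied by the pretraining experiment."""
  train_batch_size = 4096
  steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size  # 312
  total_train_steps = 300 * steps_per_epoch  # 93600
  validation_steps = IMAGENET_VAL_EXAMPLES // train_batch_size  # 12
  return steps_per_epoch, total_train_steps, validation_steps
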
@exp_factory.register_config_factory('vit_imagenet_finetune')
def image_classification_imagenet_vit_finetune() -> cfg.ExperimentConfig:
"""Image classification on imagenet with vision transformer."""
train_batch_size = 512
eval_batch_size = 512
steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size
config = cfg.ExperimentConfig(
task=ImageClassificationTask(
model=ImageClassificationModel(
num_classes=1001,
input_size=[384, 384, 3],
backbone=backbones.Backbone(
type='vit',
vit=backbones.VisionTransformer(model_name='vit-b16'))),
losses=Losses(l2_weight_decay=0.0),
train_data=DataConfig(
input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'),
is_training=True,
global_batch_size=train_batch_size),
validation_data=DataConfig(
input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'),
is_training=False,
global_batch_size=eval_batch_size)),
trainer=cfg.TrainerConfig(
steps_per_loop=steps_per_epoch,
summary_interval=steps_per_epoch,
checkpoint_interval=steps_per_epoch,
train_steps=20000,
validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size,
validation_interval=steps_per_epoch,
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'sgd',
'sgd': {
'momentum': 0.9,
'global_clipnorm': 1.0,
}
},
'learning_rate': {
'type': 'cosine',
'cosine': {
'initial_learning_rate': 0.003,
'decay_steps': 20000,
}
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
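
# Illustrative sketch, not part of the original file: retrieving one of the
# experiments registered above by name and overriding a few data fields,
# using the exp_factory module already imported at the top of this file.
def _example_get_finetune_experiment():
  """Fetches the ViT fine-tuning experiment with a smaller batch size."""
  config = exp_factory.get_exp_config('vit_imagenet_finetune')
  config.override({
      'task': {
          'train_data': {'global_batch_size': 64},
          'validation_data': {'global_batch_size': 64},
      }
  })
  return config
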
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""VisionTransformer models."""
import tensorflow as tf
from official.modeling import activations
from official.nlp import keras_nlp
from official.vision.beta.modeling.backbones import factory
layers = tf.keras.layers
VIT_SPECS = {
'vit-testing':
dict(
hidden_size=1,
patch_size=16,
transformer=dict(mlp_dim=1, num_heads=1, num_layers=1),
),
'vit-b16':
dict(
hidden_size=768,
patch_size=16,
transformer=dict(mlp_dim=3072, num_heads=12, num_layers=12),
),
'vit-b32':
dict(
hidden_size=768,
patch_size=32,
transformer=dict(mlp_dim=3072, num_heads=12, num_layers=12),
),
'vit-l16':
dict(
hidden_size=1024,
patch_size=16,
transformer=dict(mlp_dim=4096, num_heads=16, num_layers=24),
),
'vit-l32':
dict(
hidden_size=1024,
patch_size=32,
transformer=dict(mlp_dim=4096, num_heads=16, num_layers=24),
),
'vit-h14':
dict(
hidden_size=1280,
patch_size=14,
transformer=dict(mlp_dim=5120, num_heads=16, num_layers=32),
),
}
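
# Illustrative helper, not part of the original file: the token sequence
# length implied by a spec above, mirroring the `seq_len` computation in
# VisionTransformer below. For 'vit-b16' at 224x224 this is 14 * 14 = 196
# patch tokens, or 197 once the classification token is prepended.
def _example_seq_len(model_name='vit-b16', image_size=224, classifier='token'):
  patch_size = VIT_SPECS[model_name]['patch_size']
  num_patch_tokens = (image_size // patch_size) ** 2
  return num_patch_tokens + 1 if classifier == 'token' else num_patch_tokens
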
class AddPositionEmbs(tf.keras.layers.Layer):
"""Adds (optionally learned) positional embeddings to the inputs."""
def __init__(self, posemb_init=None, **kwargs):
super().__init__(**kwargs)
self.posemb_init = posemb_init
def build(self, inputs_shape):
pos_emb_shape = (1, inputs_shape[1], inputs_shape[2])
self.pos_embedding = self.add_weight(
'pos_embedding', pos_emb_shape, initializer=self.posemb_init)
def call(self, inputs, inputs_positions=None):
# inputs.shape is (batch_size, seq_len, emb_dim).
pos_embedding = tf.cast(self.pos_embedding, inputs.dtype)
return inputs + pos_embedding
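
# Minimal usage sketch, not part of the original file: the layer adds a
# learnable (1, seq_len, dim) embedding that broadcasts over the batch, so
# the output shape equals the input shape.
def _example_add_position_embs():
  layer = AddPositionEmbs(
      posemb_init=tf.keras.initializers.RandomNormal(stddev=0.02))
  tokens = tf.zeros([2, 197, 768])
  return layer(tokens)  # Shape (2, 197, 768).
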
class TokenLayer(tf.keras.layers.Layer):
"""A simple layer to wrap token parameters."""
def build(self, inputs_shape):
self.cls = self.add_weight(
'cls', (1, 1, inputs_shape[-1]), initializer='zeros')
def call(self, inputs):
cls = tf.cast(self.cls, inputs.dtype)
cls = cls + tf.zeros_like(inputs[:, 0:1]) # A hacky way to tile.
x = tf.concat([cls, inputs], axis=1)
return x
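
# Minimal usage sketch, not part of the original file: TokenLayer prepends a
# single learnable class token, so 196 patch tokens become 197 tokens.
def _example_token_layer():
  layer = TokenLayer(name='cls_example')
  patches = tf.zeros([2, 196, 768])
  return layer(patches)  # Shape (2, 197, 768).
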
class Encoder(tf.keras.layers.Layer):
"""Transformer Encoder."""
def __init__(self,
num_layers,
mlp_dim,
num_heads,
dropout_rate=0.1,
attention_dropout_rate=0.1,
kernel_regularizer=None,
inputs_positions=None,
**kwargs):
super().__init__(**kwargs)
self._num_layers = num_layers
self._mlp_dim = mlp_dim
self._num_heads = num_heads
self._dropout_rate = dropout_rate
self._attention_dropout_rate = attention_dropout_rate
self._kernel_regularizer = kernel_regularizer
self._inputs_positions = inputs_positions
def build(self, input_shape):
self._pos_embed = AddPositionEmbs(
posemb_init=tf.keras.initializers.RandomNormal(stddev=0.02),
name='posembed_input')
self._dropout = layers.Dropout(rate=self._dropout_rate)
self._encoder_layers = []
# Set layer norm epsilons to 1e-6 to be consistent with JAX implementation.
# https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.LayerNorm.html
for _ in range(self._num_layers):
encoder_layer = keras_nlp.layers.TransformerEncoderBlock(
inner_activation=activations.gelu,
num_attention_heads=self._num_heads,
inner_dim=self._mlp_dim,
output_dropout=self._dropout_rate,
attention_dropout=self._attention_dropout_rate,
kernel_regularizer=self._kernel_regularizer,
norm_first=True,
norm_epsilon=1e-6)
self._encoder_layers.append(encoder_layer)
self._norm = layers.LayerNormalization(epsilon=1e-6)
super().build(input_shape)
def call(self, inputs, training=None):
x = self._pos_embed(inputs, inputs_positions=self._inputs_positions)
x = self._dropout(x, training=training)
for encoder_layer in self._encoder_layers:
x = encoder_layer(x, training=training)
x = self._norm(x)
return x
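
# Minimal usage sketch, not part of the original file: the Encoder preserves
# the (batch, seq_len, dim) shape; it only adds positional embeddings, applies
# dropout and the transformer blocks, and layer-normalizes the output.
def _example_encoder():
  encoder = Encoder(num_layers=2, mlp_dim=128, num_heads=4)
  tokens = tf.zeros([2, 197, 64])
  return encoder(tokens, training=False)  # Shape (2, 197, 64).
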
class VisionTransformer(tf.keras.Model):
"""Class to build VisionTransformer family model."""
def __init__(self,
mlp_dim=3072,
num_heads=12,
num_layers=12,
attention_dropout_rate=0.0,
dropout_rate=0.1,
input_specs=layers.InputSpec(shape=[None, None, None, 3]),
patch_size=16,
hidden_size=768,
representation_size=0,
classifier='token',
kernel_regularizer=None):
"""VisionTransformer initialization function."""
inputs = tf.keras.Input(shape=input_specs.shape[1:])
x = layers.Conv2D(
filters=hidden_size,
kernel_size=patch_size,
strides=patch_size,
padding='valid',
kernel_regularizer=kernel_regularizer)(
inputs)
if tf.keras.backend.image_data_format() == 'channels_last':
rows_axis, cols_axis = (1, 2)
else:
      rows_axis, cols_axis = (2, 3)
      # The reshape below assumes the data_format is 'channels_last', so
      # transpose to that layout here. Once the data is flattened by the
      # reshape, the data_format is irrelevant, so there is no need to update
      # tf.keras.backend.image_data_format.
      x = tf.transpose(x, perm=[0, 2, 3, 1])
seq_len = (input_specs.shape[rows_axis] // patch_size) * (
input_specs.shape[cols_axis] // patch_size)
x = tf.reshape(x, [-1, seq_len, hidden_size])
# If we want to add a class token, add it here.
if classifier == 'token':
x = TokenLayer(name='cls')(x)
x = Encoder(
num_layers=num_layers,
mlp_dim=mlp_dim,
num_heads=num_heads,
dropout_rate=dropout_rate,
attention_dropout_rate=attention_dropout_rate,
kernel_regularizer=kernel_regularizer)(
x)
if classifier == 'token':
x = x[:, 0]
elif classifier == 'gap':
x = tf.reduce_mean(x, axis=1)
if representation_size:
x = tf.keras.layers.Dense(
representation_size,
kernel_regularizer=kernel_regularizer,
name='pre_logits')(
x)
x = tf.nn.tanh(x)
else:
x = tf.identity(x, name='pre_logits')
endpoints = {
'pre_logits':
tf.reshape(x, [-1, 1, 1, representation_size or hidden_size])
}
super(VisionTransformer, self).__init__(inputs=inputs, outputs=endpoints)
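
# Minimal usage sketch, not part of the original file: building the backbone
# directly (vit-b16-style defaults) and inspecting its single endpoint. The
# input spec must carry concrete spatial dimensions so `seq_len` can be
# computed.
def _example_vision_transformer():
  input_specs = layers.InputSpec(shape=[None, 224, 224, 3])
  model = VisionTransformer(input_specs=input_specs)
  endpoints = model(tf.zeros([1, 224, 224, 3]))
  return endpoints['pre_logits']  # Shape (1, 1, 1, 768).
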
@factory.register_backbone_builder('vit')
def build_vit(input_specs,
backbone_config,
norm_activation_config,
l2_regularizer=None):
"""Build ViT model."""
del norm_activation_config
backbone_type = backbone_config.type
backbone_cfg = backbone_config.get()
assert backbone_type == 'vit', (f'Inconsistent backbone type '
f'{backbone_type}')
backbone_cfg.override(VIT_SPECS[backbone_cfg.model_name])
return VisionTransformer(
mlp_dim=backbone_cfg.transformer.mlp_dim,
num_heads=backbone_cfg.transformer.num_heads,
num_layers=backbone_cfg.transformer.num_layers,
attention_dropout_rate=backbone_cfg.transformer.attention_dropout_rate,
dropout_rate=backbone_cfg.transformer.dropout_rate,
input_specs=input_specs,
patch_size=backbone_cfg.patch_size,
hidden_size=backbone_cfg.hidden_size,
representation_size=backbone_cfg.representation_size,
classifier=backbone_cfg.classifier,
kernel_regularizer=l2_regularizer)
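
# Illustrative sketch, not part of the original file: invoking the registered
# builder directly with the config classes from this project. The tiny
# 'vit-testing' spec keeps the example cheap to run; the import is local only
# to keep the sketch self-contained.
def _example_build_vit():
  from official.vision.beta.projects.vit.configs import backbones as vit_cfgs
  backbone_config = vit_cfgs.Backbone(
      type='vit',
      vit=vit_cfgs.VisionTransformer(model_name='vit-testing'))
  input_specs = layers.InputSpec(shape=[None, 64, 64, 3])
  return build_vit(
      input_specs=input_specs,
      backbone_config=backbone_config,
      norm_activation_config=None)
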
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for VIT."""
from absl.testing import parameterized
import tensorflow as tf
from official.vision.beta.projects.vit.modeling import vit
class VisionTransformerTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(224, 85798656),
(256, 85844736),
)
def test_network_creation(self, input_size, params_count):
"""Test creation of VisionTransformer family models."""
tf.keras.backend.set_image_data_format('channels_last')
input_specs = tf.keras.layers.InputSpec(
shape=[2, input_size, input_size, 3])
network = vit.VisionTransformer(input_specs=input_specs)
inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1)
_ = network(inputs)
self.assertEqual(network.count_params(), params_count)
if __name__ == '__main__':
tf.test.main()