Merge branch 'master' of https://github.com/tensorflow/models into detection_generator_pr_2

b92025a9 · anivegesana · 1b425791 · 37536370 · b92025a9 · b92025a9
Commit b92025a9 authored Aug 18, 2021 by anivegesana
20 changed files
--- a/official/vision/beta/modeling/layers/detection_generator.py
+++ b/official/vision/beta/modeling/layers/detection_generator.py
@@ -514,22 +514,22 @@ class DetectionGenerator(tf.keras.layers.Layer):
      }

    if self._config_dict['use_batched_nms']:
-      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
          _generate_detections_batched(
-              decoded_boxes,
-              box_scores,
+              decoded_boxes, box_scores,
              self._config_dict['pre_nms_score_threshold'],
              self._config_dict['nms_iou_threshold'],
              self._config_dict['max_num_detections']))
    else:
-      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
-          _generate_detections_v2(
+      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = (
+          _generate_detections_v1(
              decoded_boxes,
              box_scores,
-              self._config_dict['pre_nms_top_k'],
-              self._config_dict['pre_nms_score_threshold'],
-              self._config_dict['nms_iou_threshold'],
-              self._config_dict['max_num_detections']))
+              pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+              pre_nms_score_threshold=self
+              ._config_dict['pre_nms_score_threshold'],
+              nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+              max_num_detections=self._config_dict['max_num_detections']))

    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1
@@ -714,35 +714,26 @@ class MultilevelDetectionGenerator(tf.keras.layers.Layer):
      if raw_attributes:
        raise ValueError('Attribute learning is not supported for batched NMS.')

-      nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
+      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
          _generate_detections_batched(
-              boxes,
-              scores,
-              self._config_dict['pre_nms_score_threshold'],
+              boxes, scores, self._config_dict['pre_nms_score_threshold'],
              self._config_dict['nms_iou_threshold'],
              self._config_dict['max_num_detections']))
      # Set `nmsed_attributes` to an empty dict for batched NMS.
      nmsed_attributes = {}
    else:
-      if raw_attributes:
-        nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes = (
-            _generate_detections_v1(
-                boxes,
-                scores,
-                attributes=attributes if raw_attributes else None,
-                pre_nms_top_k=self._config_dict['pre_nms_top_k'],
-                pre_nms_score_threshold=self
-                ._config_dict['pre_nms_score_threshold'],
-                nms_iou_threshold=self._config_dict['nms_iou_threshold'],
-                max_num_detections=self._config_dict['max_num_detections']))
-      else:
-        nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
-            _generate_detections_v2(
-                boxes, scores, self._config_dict['pre_nms_top_k'],
-                self._config_dict['pre_nms_score_threshold'],
-                self._config_dict['nms_iou_threshold'],
-                self._config_dict['max_num_detections']))
-        nmsed_attributes = {}
+      (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections,
+       nmsed_attributes) = (
+           _generate_detections_v1(
+               boxes,
+               scores,
+               attributes=attributes if raw_attributes else None,
+               pre_nms_top_k=self._config_dict['pre_nms_top_k'],
+               pre_nms_score_threshold=self
+               ._config_dict['pre_nms_score_threshold'],
+               nms_iou_threshold=self._config_dict['nms_iou_threshold'],
+               max_num_detections=self._config_dict['max_num_detections']))
+
    # Adds 1 to offset the background class which has index 0.
    nmsed_classes += 1


--- a/official/vision/beta/modeling/layers/nn_layers.py
+++ b/official/vision/beta/modeling/layers/nn_layers.py
@@ -165,7 +165,8 @@ class SqueezeExcitation(tf.keras.layers.Layer):

  def build(self, input_shape):
    num_reduced_filters = make_divisible(
-        self._in_filters * self._se_ratio, divisor=self._divisible_by)
+        max(1, int(self._in_filters * self._se_ratio)),
+        divisor=self._divisible_by)

    self._se_reduce = tf.keras.layers.Conv2D(
        filters=num_reduced_filters,
@@ -424,7 +425,7 @@ class PositionalEncoding(tf.keras.layers.Layer):
    self._rezero = Scale(initializer=initializer, name='rezero')
    state_prefix = state_prefix if state_prefix is not None else ''
    self._state_prefix = state_prefix
-    self._frame_count_name = f'{state_prefix}/pos_enc_frame_count'
+    self._frame_count_name = f'{state_prefix}_pos_enc_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
@@ -522,7 +523,7 @@ class PositionalEncoding(tf.keras.layers.Layer):
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s). Expected keys
-        include `state_prefix + '/pos_enc_frame_count'`.
+        include `state_prefix + '_pos_enc_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.

@@ -586,8 +587,8 @@ class GlobalAveragePool3D(tf.keras.layers.Layer):
    state_prefix = state_prefix if state_prefix is not None else ''
    self._state_prefix = state_prefix

-    self._state_name = f'{state_prefix}/pool_buffer'
-    self._frame_count_name = f'{state_prefix}/pool_frame_count'
+    self._state_name = f'{state_prefix}_pool_buffer'
+    self._frame_count_name = f'{state_prefix}_pool_frame_count'

  def get_config(self):
    """Returns a dictionary containing the config used for initialization."""
@@ -610,8 +611,8 @@ class GlobalAveragePool3D(tf.keras.layers.Layer):
      inputs: An input `tf.Tensor`.
      states: A `dict` of states such that, if any of the keys match for this
        layer, will overwrite the contents of the buffer(s).
-        Expected keys include `state_prefix + '/pool_buffer'` and
-        `state_prefix + '/pool_frame_count'`.
+        Expected keys include `state_prefix + '_pool_buffer'` and
+        `state_prefix + '_pool_frame_count'`.
      output_states: A `bool`. If True, returns the output tensor and output
        states. Returns just the output tensor otherwise.


--- a/official/vision/beta/modeling/maskrcnn_model.py
+++ b/official/vision/beta/modeling/maskrcnn_model.py
@@ -14,7 +14,7 @@

 """Mask R-CNN model."""

-from typing import Any, List, Mapping, Optional, Union
+from typing import Any, List, Mapping, Optional, Tuple, Union

 import tensorflow as tf

@@ -143,6 +143,34 @@ class MaskRCNNModel(tf.keras.Model):
           gt_classes: Optional[tf.Tensor] = None,
           gt_masks: Optional[tf.Tensor] = None,
           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
+
+    model_outputs, intermediate_outputs = self._call_box_outputs(
+        images=images, image_shape=image_shape, anchor_boxes=anchor_boxes,
+        gt_boxes=gt_boxes, gt_classes=gt_classes, training=training)
+    if not self._include_mask:
+      return model_outputs
+
+    model_mask_outputs = self._call_mask_outputs(
+        model_box_outputs=model_outputs,
+        features=intermediate_outputs['features'],
+        current_rois=intermediate_outputs['current_rois'],
+        matched_gt_indices=intermediate_outputs['matched_gt_indices'],
+        matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
+        matched_gt_classes=intermediate_outputs['matched_gt_classes'],
+        gt_masks=gt_masks,
+        training=training)
+    model_outputs.update(model_mask_outputs)
+    return model_outputs
+
+  def _call_box_outputs(
+      self, images: tf.Tensor,
+      image_shape: tf.Tensor,
+      anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
+      gt_boxes: Optional[tf.Tensor] = None,
+      gt_classes: Optional[tf.Tensor] = None,
+      training: Optional[bool] = None) -> Tuple[
+          Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
+    """Implementation of the Faster-RCNN logic for boxes."""
    model_outputs = {}

    # Feature extraction.
@@ -239,9 +267,28 @@ class MaskRCNNModel(tf.keras.Model):
            'decoded_box_scores': detections['decoded_box_scores']
        })

-    if not self._include_mask:
-      return model_outputs
-
+    intermediate_outputs = {
+        'matched_gt_boxes': matched_gt_boxes,
+        'matched_gt_indices': matched_gt_indices,
+        'matched_gt_classes': matched_gt_classes,
+        'features': features,
+        'current_rois': current_rois,
+    }
+    return (model_outputs, intermediate_outputs)
+
+  def _call_mask_outputs(
+      self,
+      model_box_outputs: Mapping[str, tf.Tensor],
+      features: tf.Tensor,
+      current_rois: tf.Tensor,
+      matched_gt_indices: tf.Tensor,
+      matched_gt_boxes: tf.Tensor,
+      matched_gt_classes: tf.Tensor,
+      gt_masks: tf.Tensor,
+      training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
+    """Implementation of Mask-RCNN mask prediction logic."""
+
+    model_outputs = dict(model_box_outputs)
    if training:
      current_rois, roi_classes, roi_masks = self.mask_sampler(
          current_rois, matched_gt_boxes, matched_gt_classes,

--- a/official/vision/beta/ops/box_ops.py
+++ b/official/vision/beta/ops/box_ops.py
@@ -624,6 +624,76 @@ def bbox_overlap(boxes, gt_boxes):
    return iou


+def bbox_generalized_overlap(boxes, gt_boxes):
+  """Calculates the GIOU between proposal and ground truth boxes.
+
+  The generalized intersection over union is an adjustment of the traditional
+  IOU metric which provides continuous updates even for predictions with no
+  overlap. This metric is defined in https://giou.stanford.edu/GIoU.pdf. Note,
+  some `gt_boxes` may have been padded. The returned `giou` tensor for these
+  boxes will be -1.
+
+  Args:
+    boxes: a `Tensor` with a shape of [batch_size, N, 4]. N is the number of
+      proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
+      last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
+    gt_boxes: a `Tensor` with a shape of [batch_size, max_num_instances, 4].
+      This tensor may have paddings with a negative value and will also be in
+      the [ymin, xmin, ymax, xmax] format.
+
+  Returns:
+    giou: a `Tensor` with a shape of [batch_size, N, max_num_instances].
+  """
+  with tf.name_scope('bbox_generalized_overlap'):
+    assert boxes.shape.as_list(
+    )[-1] == 4, 'Boxes must be defined by 4 coordinates.'
+    assert gt_boxes.shape.as_list(
+    )[-1] == 4, 'Groundtruth boxes must be defined by 4 coordinates.'
+
+    bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
+        value=boxes, num_or_size_splits=4, axis=2)
+    gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
+        value=gt_boxes, num_or_size_splits=4, axis=2)
+
+    # Calculates the hull area for each pair of boxes, with one from
+    # boxes and the other from gt_boxes.
+    # Outputs for coordinates are of shape [batch_size, N, max_num_instances]
+    h_xmin = tf.minimum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
+    h_xmax = tf.maximum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
+    h_ymin = tf.minimum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
+    h_ymax = tf.maximum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
+    h_area = tf.maximum((h_xmax - h_xmin), 0) * tf.maximum((h_ymax - h_ymin), 0)
+    # Add a small epsilon to avoid divide-by-zero.
+    h_area = h_area + 1e-8
+
+    # Calculates the intersection area.
+    i_xmin = tf.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
+    i_xmax = tf.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
+    i_ymin = tf.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
+    i_ymax = tf.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
+    i_area = tf.maximum((i_xmax - i_xmin), 0) * tf.maximum((i_ymax - i_ymin), 0)
+
+    # Calculates the union area.
+    bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
+    gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
+
+    # Adds a small epsilon to avoid divide-by-zero.
+    u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
+
+    # Calculates IoU.
+    iou = i_area / u_area
+    # Calculates GIoU.
+    giou = iou - (h_area - u_area) / h_area
+
+    # Fills -1 for GIoU entries between the padded ground truth boxes.
+    gt_invalid_mask = tf.less(
+        tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
+    padding_mask = tf.broadcast_to(
+        tf.transpose(gt_invalid_mask, [0, 2, 1]), tf.shape(giou))
+    giou = tf.where(padding_mask, -tf.ones_like(giou), giou)
+    return giou
+
+
 def box_matching(boxes, gt_boxes, gt_classes):
  """Match boxes to groundtruth boxes.


--- a/official/vision/beta/projects/deepmac_maskrcnn/configs/deep_mask_head_rcnn.py
+++ b/official/vision/beta/projects/deepmac_maskrcnn/configs/deep_mask_head_rcnn.py
@@ -22,6 +22,9 @@ import dataclasses
 from official.core import config_definitions as cfg
 from official.core import exp_factory
 from official.modeling import optimization
+from official.vision.beta.configs import backbones
+from official.vision.beta.configs import common
+from official.vision.beta.configs import decoders
 from official.vision.beta.configs import maskrcnn as maskrcnn_config
 from official.vision.beta.configs import retinanet as retinanet_config

@@ -59,20 +62,18 @@ def deep_mask_head_rcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
          annotation_file=os.path.join(maskrcnn_config.COCO_INPUT_PATH_BASE,
                                       'instances_val2017.json'),
          model=DeepMaskHeadRCNN(
-              num_classes=91,
-              input_size=[1024, 1024, 3],
-              include_mask=True),  # pytype: disable=wrong-keyword-args
+              num_classes=91, input_size=[1024, 1024, 3], include_mask=True),  # pytype: disable=wrong-keyword-args
          losses=maskrcnn_config.Losses(l2_weight_decay=0.00004),
          train_data=maskrcnn_config.DataConfig(
-              input_path=os.path.join(
-                  maskrcnn_config.COCO_INPUT_PATH_BASE, 'train*'),
+              input_path=os.path.join(maskrcnn_config.COCO_INPUT_PATH_BASE,
+                                      'train*'),
              is_training=True,
              global_batch_size=global_batch_size,
              parser=maskrcnn_config.Parser(
                  aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)),
          validation_data=maskrcnn_config.DataConfig(
-              input_path=os.path.join(
-                  maskrcnn_config.COCO_INPUT_PATH_BASE, 'val*'),
+              input_path=os.path.join(maskrcnn_config.COCO_INPUT_PATH_BASE,
+                                      'val*'),
              is_training=False,
              global_batch_size=8)),  # pytype: disable=wrong-keyword-args
      trainer=cfg.TrainerConfig(
@@ -110,3 +111,87 @@ def deep_mask_head_rcnn_resnetfpn_coco() -> cfg.ExperimentConfig:
      ])

  return config
+
+
+@exp_factory.register_config_factory('deep_mask_head_rcnn_spinenet_coco')
+def deep_mask_head_rcnn_spinenet_coco() -> cfg.ExperimentConfig:
+  """COCO object detection with Mask R-CNN with SpineNet backbone."""
+  steps_per_epoch = 463
+  coco_val_samples = 5000
+  train_batch_size = 256
+  eval_batch_size = 8
+
+  config = cfg.ExperimentConfig(
+      runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'),
+      task=DeepMaskHeadRCNNTask(
+          annotation_file=os.path.join(maskrcnn_config.COCO_INPUT_PATH_BASE,
+                                       'instances_val2017.json'),  # pytype: disable=wrong-keyword-args
+          model=DeepMaskHeadRCNN(
+              backbone=backbones.Backbone(
+                  type='spinenet',
+                  spinenet=backbones.SpineNet(
+                      model_id='49',
+                      min_level=3,
+                      max_level=7,
+                  )),
+              decoder=decoders.Decoder(
+                  type='identity', identity=decoders.Identity()),
+              anchor=maskrcnn_config.Anchor(anchor_size=3),
+              norm_activation=common.NormActivation(use_sync_bn=True),
+              num_classes=91,
+              input_size=[640, 640, 3],
+              min_level=3,
+              max_level=7,
+              include_mask=True),   # pytype: disable=wrong-keyword-args
+          losses=maskrcnn_config.Losses(l2_weight_decay=0.00004),
+          train_data=maskrcnn_config.DataConfig(
+              input_path=os.path.join(maskrcnn_config.COCO_INPUT_PATH_BASE,
+                                      'train*'),
+              is_training=True,
+              global_batch_size=train_batch_size,
+              parser=maskrcnn_config.Parser(
+                  aug_rand_hflip=True, aug_scale_min=0.5, aug_scale_max=2.0)),
+          validation_data=maskrcnn_config.DataConfig(
+              input_path=os.path.join(maskrcnn_config.COCO_INPUT_PATH_BASE,
+                                      'val*'),
+              is_training=False,
+              global_batch_size=eval_batch_size,
+              drop_remainder=False)),   # pytype: disable=wrong-keyword-args
+      trainer=cfg.TrainerConfig(
+          train_steps=steps_per_epoch * 350,
+          validation_steps=coco_val_samples // eval_batch_size,
+          validation_interval=steps_per_epoch,
+          steps_per_loop=steps_per_epoch,
+          summary_interval=steps_per_epoch,
+          checkpoint_interval=steps_per_epoch,
+          optimizer_config=optimization.OptimizationConfig({
+              'optimizer': {
+                  'type': 'sgd',
+                  'sgd': {
+                      'momentum': 0.9
+                  }
+              },
+              'learning_rate': {
+                  'type': 'stepwise',
+                  'stepwise': {
+                      'boundaries': [
+                          steps_per_epoch * 320, steps_per_epoch * 340
+                      ],
+                      'values': [0.32, 0.032, 0.0032],
+                  }
+              },
+              'warmup': {
+                  'type': 'linear',
+                  'linear': {
+                      'warmup_steps': 2000,
+                      'warmup_learning_rate': 0.0067
+                  }
+              }
+          })),
+      restrictions=[
+          'task.train_data.is_training != None',
+          'task.validation_data.is_training != None',
+          'task.model.min_level == task.model.backbone.spinenet.min_level',
+          'task.model.max_level == task.model.backbone.spinenet.max_level',
+      ])
+  return config
--- a/official/vision/beta/projects/deepmac_maskrcnn/configs/deep_mask_head_rcnn_config_test.py
+++ b/official/vision/beta/projects/deepmac_maskrcnn/configs/deep_mask_head_rcnn_config_test.py
@@ -25,6 +25,10 @@ class DeepMaskHeadRcnnConfigTest(tf.test.TestCase):
    config = deep_mask_head_rcnn.deep_mask_head_rcnn_resnetfpn_coco()
    self.assertIsInstance(config.task, deep_mask_head_rcnn.DeepMaskHeadRCNNTask)

+  def test_config_spinenet(self):
+    config = deep_mask_head_rcnn.deep_mask_head_rcnn_spinenet_coco()
+    self.assertIsInstance(config.task, deep_mask_head_rcnn.DeepMaskHeadRCNNTask)
+

 if __name__ == '__main__':
  tf.test.main()
--- a/official/vision/beta/projects/deepmac_maskrcnn/modeling/maskrcnn_model.py
+++ b/official/vision/beta/projects/deepmac_maskrcnn/modeling/maskrcnn_model.py
@@ -14,12 +14,14 @@

 """Mask R-CNN model."""

+from typing import List, Mapping, Optional, Union
+
 # Import libraries

 from absl import logging
 import tensorflow as tf

-from official.vision.beta.ops import box_ops
+from official.vision.beta.modeling import maskrcnn_model


 def resize_as(source, size):
@@ -30,21 +32,30 @@ def resize_as(source, size):


 @tf.keras.utils.register_keras_serializable(package='Vision')
-class DeepMaskRCNNModel(tf.keras.Model):
+class DeepMaskRCNNModel(maskrcnn_model.MaskRCNNModel):
  """The Mask R-CNN model."""

  def __init__(self,
-               backbone,
-               decoder,
-               rpn_head,
-               detection_head,
-               roi_generator,
-               roi_sampler,
-               roi_aligner,
-               detection_generator,
-               mask_head=None,
-               mask_sampler=None,
-               mask_roi_aligner=None,
+               backbone: tf.keras.Model,
+               decoder: tf.keras.Model,
+               rpn_head: tf.keras.layers.Layer,
+               detection_head: Union[tf.keras.layers.Layer,
+                                     List[tf.keras.layers.Layer]],
+               roi_generator: tf.keras.layers.Layer,
+               roi_sampler: Union[tf.keras.layers.Layer,
+                                  List[tf.keras.layers.Layer]],
+               roi_aligner: tf.keras.layers.Layer,
+               detection_generator: tf.keras.layers.Layer,
+               mask_head: Optional[tf.keras.layers.Layer] = None,
+               mask_sampler: Optional[tf.keras.layers.Layer] = None,
+               mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
+               class_agnostic_bbox_pred: bool = False,
+               cascade_class_ensemble: bool = False,
+               min_level: Optional[int] = None,
+               max_level: Optional[int] = None,
+               num_scales: Optional[int] = None,
+               aspect_ratios: Optional[List[float]] = None,
+               anchor_size: Optional[float] = None,
               use_gt_boxes_for_masks=False,
               **kwargs):
    """Initializes the Mask R-CNN model.
@@ -53,122 +64,99 @@ class DeepMaskRCNNModel(tf.keras.Model):
      backbone: `tf.keras.Model`, the backbone network.
      decoder: `tf.keras.Model`, the decoder network.
      rpn_head: the RPN head.
-      detection_head: the detection head.
+      detection_head: the detection head or a list of heads.
      roi_generator: the ROI generator.
-      roi_sampler: the ROI sampler.
+      roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
+        detection heads.
      roi_aligner: the ROI aligner.
      detection_generator: the detection generator.
      mask_head: the mask head.
      mask_sampler: the mask sampler.
      mask_roi_aligner: the ROI aligner for mask prediction.
-      use_gt_boxes_for_masks: bool, if set, crop using groundtruth boxes
-        instead of proposals for training mask head
+      class_agnostic_bbox_pred: if True, perform class agnostic bounding box
+        prediction. Needs to be `True` for Cascade RCNN models.
+      cascade_class_ensemble: if True, ensemble classification scores over all
+        detection heads.
+      min_level: Minimum level in output feature maps.
+      max_level: Maximum level in output feature maps.
+      num_scales: A number representing intermediate scales added on each level.
+        For instance, num_scales=2 adds one additional intermediate anchor
+        scales [2^0, 2^0.5] on each level.
+      aspect_ratios: A list representing the aspect ratio anchors added on each
+        level. The number indicates the ratio of width to height. For instance,
+        aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
+      anchor_size: A number representing the scale of size of the base anchor to
+        the feature stride 2^level.
+      use_gt_boxes_for_masks: bool, if set, crop using groundtruth boxes instead
+        of proposals for training mask head
      **kwargs: keyword arguments to be passed.
    """
-    super(DeepMaskRCNNModel, self).__init__(**kwargs)
-    self._config_dict = {
-        'backbone': backbone,
-        'decoder': decoder,
-        'rpn_head': rpn_head,
-        'detection_head': detection_head,
-        'roi_generator': roi_generator,
-        'roi_sampler': roi_sampler,
-        'roi_aligner': roi_aligner,
-        'detection_generator': detection_generator,
-        'mask_head': mask_head,
-        'mask_sampler': mask_sampler,
-        'mask_roi_aligner': mask_roi_aligner,
-        'use_gt_boxes_for_masks': use_gt_boxes_for_masks
-    }
-    self.backbone = backbone
-    self.decoder = decoder
-    self.rpn_head = rpn_head
-    self.detection_head = detection_head
-    self.roi_generator = roi_generator
-    self.roi_sampler = roi_sampler
-    self.roi_aligner = roi_aligner
-    self.detection_generator = detection_generator
-    self._include_mask = mask_head is not None
-    self.mask_head = mask_head
-    if self._include_mask and mask_sampler is None:
-      raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
-    self.mask_sampler = mask_sampler
-    if self._include_mask and mask_roi_aligner is None:
-      raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
-    self.mask_roi_aligner = mask_roi_aligner
+    super(DeepMaskRCNNModel, self).__init__(
+        backbone=backbone,
+        decoder=decoder,
+        rpn_head=rpn_head,
+        detection_head=detection_head,
+        roi_generator=roi_generator,
+        roi_sampler=roi_sampler,
+        roi_aligner=roi_aligner,
+        detection_generator=detection_generator,
+        mask_head=mask_head,
+        mask_sampler=mask_sampler,
+        mask_roi_aligner=mask_roi_aligner,
+        class_agnostic_bbox_pred=class_agnostic_bbox_pred,
+        cascade_class_ensemble=cascade_class_ensemble,
+        min_level=min_level,
+        max_level=max_level,
+        num_scales=num_scales,
+        aspect_ratios=aspect_ratios,
+        anchor_size=anchor_size,
+        **kwargs)
+
+    self._config_dict['use_gt_boxes_for_masks'] = use_gt_boxes_for_masks

  def call(self,
-           images,
-           image_shape,
-           anchor_boxes=None,
-           gt_boxes=None,
-           gt_classes=None,
-           gt_masks=None,
-           training=None):
-    model_outputs = {}
-
-    # Feature extraction.
-    features = self.backbone(images)
-    if self.decoder:
-      features = self.decoder(features)
-
-    # Region proposal network.
-    rpn_scores, rpn_boxes = self.rpn_head(features)
-
-    model_outputs.update({
-        'rpn_boxes': rpn_boxes,
-        'rpn_scores': rpn_scores
-    })
-
-    # Generate RoIs.
-    rois, _ = self.roi_generator(
-        rpn_boxes, rpn_scores, anchor_boxes, image_shape, training)
-
-    if training:
-      rois = tf.stop_gradient(rois)
-
-      rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
-          self.roi_sampler(rois, gt_boxes, gt_classes))
-      # Assign target for the 2nd stage classification.
-      box_targets = box_ops.encode_boxes(
-          matched_gt_boxes, rois, weights=[10.0, 10.0, 5.0, 5.0])
-      # If the target is background, the box target is set to all 0s.
-      box_targets = tf.where(
-          tf.tile(
-              tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
-              [1, 1, 4]),
-          tf.zeros_like(box_targets),
-          box_targets)
-      model_outputs.update({
-          'class_targets': matched_gt_classes,
-          'box_targets': box_targets,
-      })
-
-    # RoI align.
-    roi_features = self.roi_aligner(features, rois)
-
-    # Detection head.
-    raw_scores, raw_boxes = self.detection_head(roi_features)
-
-    if training:
-      model_outputs.update({
-          'class_outputs': raw_scores,
-          'box_outputs': raw_boxes,
-      })
-    else:
-      # Post-processing.
-      detections = self.detection_generator(
-          raw_boxes, raw_scores, rois, image_shape)
-      model_outputs.update({
-          'detection_boxes': detections['detection_boxes'],
-          'detection_scores': detections['detection_scores'],
-          'detection_classes': detections['detection_classes'],
-          'num_detections': detections['num_detections'],
-      })
-
+           images: tf.Tensor,
+           image_shape: tf.Tensor,
+           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
+           gt_boxes: Optional[tf.Tensor] = None,
+           gt_classes: Optional[tf.Tensor] = None,
+           gt_masks: Optional[tf.Tensor] = None,
+           training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
+
+    model_outputs, intermediate_outputs = self._call_box_outputs(
+        images=images, image_shape=image_shape, anchor_boxes=anchor_boxes,
+        gt_boxes=gt_boxes, gt_classes=gt_classes, training=training)
    if not self._include_mask:
      return model_outputs

+    model_mask_outputs = self._call_mask_outputs(
+        model_box_outputs=model_outputs,
+        features=intermediate_outputs['features'],
+        current_rois=intermediate_outputs['current_rois'],
+        matched_gt_indices=intermediate_outputs['matched_gt_indices'],
+        matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
+        matched_gt_classes=intermediate_outputs['matched_gt_classes'],
+        gt_masks=gt_masks,
+        gt_classes=gt_classes,
+        gt_boxes=gt_boxes,
+        training=training)
+    model_outputs.update(model_mask_outputs)
+    return model_outputs
+
+  def _call_mask_outputs(
+      self,
+      model_box_outputs: Mapping[str, tf.Tensor],
+      features: tf.Tensor,
+      current_rois: tf.Tensor,
+      matched_gt_indices: tf.Tensor,
+      matched_gt_boxes: tf.Tensor,
+      matched_gt_classes: tf.Tensor,
+      gt_masks: tf.Tensor,
+      gt_classes: tf.Tensor,
+      gt_boxes: tf.Tensor,
+      training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
+
+    model_outputs = dict(model_box_outputs)
    if training:
      if self._config_dict['use_gt_boxes_for_masks']:
        mask_size = (
@@ -184,11 +172,8 @@ class DeepMaskRCNNModel(tf.keras.Model):
        })
      else:
        rois, roi_classes, roi_masks = self.mask_sampler(
-            rois,
-            matched_gt_boxes,
-            matched_gt_classes,
-            matched_gt_indices,
-            gt_masks)
+            current_rois, matched_gt_boxes, matched_gt_classes,
+            matched_gt_indices, gt_masks)
        roi_masks = tf.stop_gradient(roi_masks)
        model_outputs.update({
            'mask_class_targets': roi_classes,
@@ -219,24 +204,3 @@ class DeepMaskRCNNModel(tf.keras.Model):
          'detection_masks': tf.math.sigmoid(raw_masks),
      })
    return model_outputs
-
-  @property
-  def checkpoint_items(self):
-    """Returns a dictionary of items to be additionally checkpointed."""
-    items = dict(
-        backbone=self.backbone,
-        rpn_head=self.rpn_head,
-        detection_head=self.detection_head)
-    if self.decoder is not None:
-      items.update(decoder=self.decoder)
-    if self._include_mask:
-      items.update(mask_head=self.mask_head)
-
-    return items
-
-  def get_config(self):
-    return self._config_dict
-
-  @classmethod
-  def from_config(cls, config):
-    return cls(**config)
--- a/official/vision/beta/projects/example/README.md
+++ b/official/vision/beta/projects/example/README.md
+# TF Vision Example Project
+
+This is a minimal example project to demonstrate how to use TF Model Garden's
+building blocks to implement a new vision project from scratch.
+
+Below we use classification as an example. We will walk you through the process
+of creating a new project leveraging existing components, such as tasks, data
+loaders, models, etc. You will get a better understanding of these components
+by going through the process. You can also refer to the docstrings of the
+corresponding components to get more information.
+
+## Create Model
+
+In
+[example_model.py](example_model.py),
+we show how to create a new model. The `ExampleModel` is a subclass of
+`tf.keras.Model` that defines the necessary parameters. Here, you need to have
+`input_specs` to specify the input shape and dimensions, and build layers within
+the constructor:
+
+```python
+class ExampleModel(tf.keras.Model):
+  def __init__(
+    self,
+    num_classes: int,
+    input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec(
+        shape=[None, None, None, 3]),
+    **kwargs):
+    # Build layers.
+```
+
+Given the `ExampleModel`, you can define a function that takes a model config as
+input and returns an `ExampleModel` instance, similar to
+[build_example_model](example_model.py#L80).
+As a simple example, we define a single model. However, you can split the model
+implementation into individual components, such as backbones, decoders, and
+heads, as we do
+[here](https://github.com/tensorflow/models/blob/master/official/vision/beta/modeling).
+Then, in the `build_example_model` function, you can hook up these components
+together to obtain your full model.
+
+## Create Dataloader
+
+A dataloader reads, decodes and parses the input data. We have created various
+[dataloaders](https://github.com/tensorflow/models/blob/master/official/vision/beta/dataloaders)
+to handle standard input formats for classification, detection and segmentation.
+If you have non-standard or complex data, you may want to create your own
+dataloader. It contains a `Decoder` and a `Parser`.
+
+-   The
+    [Decoder](example_input.py#L33)
+    decodes a TF Example record and returns a dictionary of decoded tensors:
+
+    ```python
+    class Decoder(decoder.Decoder):
+          """A tf.Example decoder for classification task."""
+          def __init__(self):
+            """Initializes the decoder.
+
+            The constructor defines the mapping between the field name and the value
+            from an input tf.Example. For example, we define two fields for image bytes
+            and labels. There is no limit on the number of fields to decode.
+            """
+            self._keys_to_features = {
+                'image/encoded':
+                    tf.io.FixedLenFeature((), tf.string, default_value=''),
+                'image/class/label':
+                    tf.io.FixedLenFeature((), tf.int64, default_value=-1)
+            }
+    ```
+
+-   The
+    [Parser](example_input.py#L68)
+    parses the decoded tensors and performs pre-processing to the input data,
+    such as image decoding, augmentation and resizing, etc. It should have
+    `_parse_train_data` and `_parse_eval_data` functions, in which the processed
+    images and labels are returned.
+
+## Create Config
+
+Next you will define configs for your project. All configs are defined as
+`dataclass` objects, and can have default parameter values.
+
+First, you will define your
+[`ExampleDataConfig`](example_config.py#L27).
+It inherits from `config_definitions.DataConfig` that already defines a few
+common fields, like `input_path`, `file_type`, `global_batch_size`, etc. You can
+add more fields in your own config as needed.
+
+You can then define your model config
+[`ExampleModel`](example_config.py#L39)
+that inherits from `hyperparams.Config`. Expose your own model parameters here.
+
+You can then define your `Loss` and `Evaluation` configs.
+
+Next, you will put all the above configs into an
+[`ExampleTask`](example_config.py#L56)
+config. Here you list the configs for your data, model, loss, and evaluation,
+etc.
+
+Finally, you can define a
+[`tf_vision_example_experiment`](example_config.py#L66),
+which creates a template for your experiments and fills it with default parameters.
+These default parameter values can be overridden by a YAML file, like
+[example_config_tpu.yaml](example_config_tpu.yaml).
+Also, make sure you give a unique name to your experiment template by the
+decorator:
+
+```python
+@exp_factory.register_config_factory('tf_vision_example_experiment')
+def tf_vision_example_experiment() -> cfg.ExperimentConfig:
+  """Definition of a full example experiment."""
+  # Create and return experiment template.
+```
+
+## Create Task
+
+A task is a class that encapsulates the logic of loading data, building models,
+performing one-step training and validation, etc. It connects all components
+together and is called by the base
+[Trainer](https://github.com/tensorflow/models/blob/master/official/core/base_trainer.py).
+
+You can create your own task by inheriting from base
+[Task](https://github.com/tensorflow/models/blob/master/official/core/base_task.py),
+or from one of the
+[tasks](https://github.com/tensorflow/models/blob/master/official/vision/beta/tasks/)
+we already defined, if most of the operations can be reused. An `ExampleTask`
+inheriting from
+[ImageClassificationTask](https://github.com/tensorflow/models/blob/master/official/vision/beta/tasks/image_classification.py#L32)
+can be found
+[here](example_task.py).
+We will go through each important component of the task below.
+
+-   `build_model`: you can instantiate a model you have defined above. It is
+    also good practice to run forward pass with a dummy input to ensure layers
+    within the model are properly initialized.
+
+-   `build_inputs`: here you can instantiate a Decoder object and a Parser
+    object. They are used to create an `InputReader` that will generate a
+    `tf.data.Dataset` object.
+
+-   `build_losses`: it takes groundtruth labels and model outputs as input, and
+    computes the loss. It will be called in `train_step` and `validation_step`.
+    You can also define different losses for training and validation, for
+    example, `build_train_losses` and `build_validation_losses`. Just make sure
+    they are called by the corresponding functions properly.
+
+-   `build_metrics`: here you can define your own metrics. It should return a
+    list of `tf.keras.metrics.Metric` objects. You can create your own metric
+    class by subclassing `tf.keras.metrics.Metric`.
+
+-   `train_step` and `validation_step`: they perform one-step training and
+    validation. They take one batch of training/validation data, run a forward
+    pass, gather losses and update metrics. They assume the data format is
+    consistent with that of the `Parser` output. `train_step` also contains
+    a backward pass to update model weights.
+
+## Import registry
+
+To use your custom dataloaders, models, tasks, etc., you will need to register
+them properly. The recommended way is to have a single file with all relevant
+files imported, for example,
+[registry_imports.py](registry_imports.py).
+You can see in this file we import all our custom components:
+
+```python
+# pylint: disable=unused-import
+from official.common import registry_imports
+from official.vision.beta.projects.example import example_config
+from official.vision.beta.projects.example import example_input
+from official.vision.beta.projects.example import example_model
+from official.vision.beta.projects.example import example_task
+```
+
+## Training
+
+You can create your own trainer by branching from our core
+[trainer](https://github.com/tensorflow/models/blob/master/official/vision/beta/train.py).
+Just make sure you import the registry like this:
+
+```python
+from official.vision.beta.projects.example import registry_imports  # pylint: disable=unused-import
+```
+
+You can run training locally for testing purposes:
+
+```bash
+# Assume you are under official/vision/beta/projects.
+python3 example/train.py \
+  --experiment=tf_vision_example_experiment \
+  --config_file=${PWD}/example/example_config_local.yaml \
+  --mode=train \
+  --model_dir=/tmp/tfvision_test/
+```
+
+Training can also run on Google Cloud using Cloud TPU.
+[Here](https://cloud.google.com/tpu/docs/how-to) are the instructions for using
+Cloud TPU, and here is a more detailed
+[tutorial](https://cloud.google.com/tpu/docs/tutorials/resnet-rs-2.x) on
+training a ResNet-RS model. Follow the instructions to set up Cloud TPU and
+launch training with:
+
+```bash
+EXP_TYPE=tf_vision_example_experiment  # This should match the registered name of your experiment template.
+EXP_NAME=exp_001  # You can give any name to the experiment.
+TPU_NAME=experiment01
+# Now launch the experiment.
+python3 example/train.py \
+  --experiment=$EXP_TYPE \
+  --mode=train \
+  --tpu=$TPU_NAME \
+  --model_dir=/tmp/tfvision_test/ \
+  --config_file=third_party/tensorflow_models/official/vision/beta/projects/example/example_config_tpu.yaml
+```
--- a/official/vision/beta/projects/movinet/README.md
+++ b/official/vision/beta/projects/movinet/README.md
@@ -338,7 +338,7 @@ with the Python API:
 ```python
 # Create the interpreter and signature runner
 interpreter = tf.lite.Interpreter('/tmp/movinet_a0_stream.tflite')
-signature = interpreter.get_signature_runner()
+runner = interpreter.get_signature_runner()

 # Extract state names and create the initial (zero) states
 def state_name(name: str) -> str:
@@ -358,7 +358,7 @@ clips = tf.split(video, video.shape[1], axis=1)
 states = init_states
 for clip in clips:
  # Input shape: [1, 1, 172, 172, 3]
-  outputs = signature(**states, image=clip)
+  outputs = runner(**states, image=clip)
  logits = outputs.pop('logits')
  states = outputs
 ```

--- a/official/vision/beta/projects/movinet/export_saved_model_test.py
+++ b/official/vision/beta/projects/movinet/export_saved_model_test.py
@@ -121,7 +121,7 @@ class ExportSavedModelTest(tf.test.TestCase):
    tflite_model = converter.convert()

    interpreter = tf.lite.Interpreter(model_content=tflite_model)
-    signature = interpreter.get_signature_runner()
+    runner = interpreter.get_signature_runner('serving_default')

    def state_name(name: str) -> str:
      return name[len('serving_default_'):-len(':0')]
@@ -137,7 +137,7 @@ class ExportSavedModelTest(tf.test.TestCase):

    states = init_states
    for clip in clips:
-      outputs = signature(**states, image=clip)
+      outputs = runner(**states, image=clip)
      logits = outputs.pop('logits')
      states = outputs


--- a/official/vision/beta/projects/movinet/modeling/movinet.py
+++ b/official/vision/beta/projects/movinet/modeling/movinet.py
@@ -17,10 +17,10 @@

 Reference: https://arxiv.org/pdf/2103.11511.pdf
 """
+import dataclasses
 import math
 from typing import Dict, Mapping, Optional, Sequence, Tuple, Union

-import dataclasses
 import tensorflow as tf

 from official.modeling import hyperparams
@@ -454,7 +454,7 @@ class Movinet(tf.keras.Model):
    stochastic_depth_idx = 1
    for block_idx, block in enumerate(self._block_specs):
      if isinstance(block, StemSpec):
-        x, states = movinet_layers.Stem(
+        layer_obj = movinet_layers.Stem(
            block.filters,
            block.kernel_size,
            block.strides,
@@ -466,9 +466,9 @@ class Movinet(tf.keras.Model):
            batch_norm_layer=self._norm,
            batch_norm_momentum=self._norm_momentum,
            batch_norm_epsilon=self._norm_epsilon,
-            state_prefix='state/stem',
-            name='stem')(
-                x, states=states)
+            state_prefix='state_stem',
+            name='stem')
+        x, states = layer_obj(x, states=states)
        endpoints['stem'] = x
      elif isinstance(block, MovinetBlockSpec):
        if not (len(block.expand_filters) == len(block.kernel_sizes) ==
@@ -486,8 +486,8 @@ class Movinet(tf.keras.Model):
              self._stochastic_depth_drop_rate * stochastic_depth_idx /
              num_layers)
          expand_filters, kernel_size, strides = layer
-          name = f'b{block_idx-1}/l{layer_idx}'
-          x, states = movinet_layers.MovinetBlock(
+          name = f'block{block_idx-1}_layer{layer_idx}'
+          layer_obj = movinet_layers.MovinetBlock(
              block.base_filters,
              expand_filters,
              kernel_size=kernel_size,
@@ -505,13 +505,14 @@ class Movinet(tf.keras.Model):
              batch_norm_layer=self._norm,
              batch_norm_momentum=self._norm_momentum,
              batch_norm_epsilon=self._norm_epsilon,
-              state_prefix=f'state/{name}',
-              name=name)(
-                  x, states=states)
+              state_prefix=f'state_{name}',
+              name=name)
+          x, states = layer_obj(x, states=states)
+
          endpoints[name] = x
          stochastic_depth_idx += 1
      elif isinstance(block, HeadSpec):
-        x, states = movinet_layers.Head(
+        layer_obj = movinet_layers.Head(
            project_filters=block.project_filters,
            conv_type=self._conv_type,
            activation=self._activation,
@@ -520,9 +521,9 @@ class Movinet(tf.keras.Model):
            batch_norm_layer=self._norm,
            batch_norm_momentum=self._norm_momentum,
            batch_norm_epsilon=self._norm_epsilon,
-            state_prefix='state/head',
-            name='head')(
-                x, states=states)
+            state_prefix='state_head',
+            name='head')
+        x, states = layer_obj(x, states=states)
        endpoints['head'] = x
      else:
        raise ValueError('Unknown block type {}'.format(block))
@@ -567,7 +568,7 @@ class Movinet(tf.keras.Model):
    for block_idx, block in enumerate(block_specs):
      if isinstance(block, StemSpec):
        if block.kernel_size[0] > 1:
-          states['state/stem/stream_buffer'] = (
+          states['state_stem_stream_buffer'] = (
              input_shape[0],
              input_shape[1],
              divide_resolution(input_shape[2], num_downsamples),
@@ -590,8 +591,10 @@ class Movinet(tf.keras.Model):
              self._conv_type in ['2plus1d', '3d_2plus1d']):
            num_downsamples += 1

+          prefix = f'state_block{block_idx}_layer{layer_idx}'
+
          if kernel_size[0] > 1:
-            states[f'state/b{block_idx}/l{layer_idx}/stream_buffer'] = (
+            states[f'{prefix}_stream_buffer'] = (
                input_shape[0],
                kernel_size[0] - 1,
                divide_resolution(input_shape[2], num_downsamples),
@@ -599,13 +602,13 @@ class Movinet(tf.keras.Model):
                expand_filters,
            )

-          states[f'state/b{block_idx}/l{layer_idx}/pool_buffer'] = (
+          states[f'{prefix}_pool_buffer'] = (
              input_shape[0], 1, 1, 1, expand_filters,
          )
-          states[f'state/b{block_idx}/l{layer_idx}/pool_frame_count'] = (1,)
+          states[f'{prefix}_pool_frame_count'] = (1,)

          if use_positional_encoding:
-            name = f'state/b{block_idx}/l{layer_idx}/pos_enc_frame_count'
+            name = f'{prefix}_pos_enc_frame_count'
            states[name] = (1,)

          if strides[1] != strides[2]:
@@ -618,10 +621,10 @@ class Movinet(tf.keras.Model):
              self._conv_type not in ['2plus1d', '3d_2plus1d']):
            num_downsamples += 1
      elif isinstance(block, HeadSpec):
-        states['state/head/pool_buffer'] = (
+        states['state_head_pool_buffer'] = (
            input_shape[0], 1, 1, 1, block.project_filters,
        )
-        states['state/head/pool_frame_count'] = (1,)
+        states['state_head_pool_frame_count'] = (1,)

    return states


--- a/official/vision/beta/projects/movinet/modeling/movinet_layers.py
+++ b/official/vision/beta/projects/movinet/modeling/movinet_layers.py
@@ -478,7 +478,7 @@ class StreamBuffer(tf.keras.layers.Layer):

    state_prefix = state_prefix if state_prefix is not None else ''
    self._state_prefix = state_prefix
-    self._state_name = f'{state_prefix}/stream_buffer'
+    self._state_name = f'{state_prefix}_stream_buffer'
    self._buffer_size = buffer_size

  def get_config(self):
@@ -501,7 +501,7 @@ class StreamBuffer(tf.keras.layers.Layer):
      inputs: the input tensor.
      states: a dict of states such that, if any of the keys match for this
          layer, will overwrite the contents of the buffer(s).
-          Expected keys include `state_prefix + '/stream_buffer'`.
+          Expected keys include `state_prefix + '_stream_buffer'`.

    Returns:
      the output tensor and states

--- a/official/vision/beta/projects/movinet/modeling/movinet_test.py
+++ b/official/vision/beta/projects/movinet/modeling/movinet_test.py
@@ -35,11 +35,11 @@ class MoViNetTest(parameterized.TestCase, tf.test.TestCase):
    endpoints, states = network(inputs)

    self.assertAllEqual(endpoints['stem'].shape, [1, 8, 64, 64, 8])
-    self.assertAllEqual(endpoints['b0/l0'].shape, [1, 8, 32, 32, 8])
-    self.assertAllEqual(endpoints['b1/l0'].shape, [1, 8, 16, 16, 32])
-    self.assertAllEqual(endpoints['b2/l0'].shape, [1, 8, 8, 8, 56])
-    self.assertAllEqual(endpoints['b3/l0'].shape, [1, 8, 8, 8, 56])
-    self.assertAllEqual(endpoints['b4/l0'].shape, [1, 8, 4, 4, 104])
+    self.assertAllEqual(endpoints['block0_layer0'].shape, [1, 8, 32, 32, 8])
+    self.assertAllEqual(endpoints['block1_layer0'].shape, [1, 8, 16, 16, 32])
+    self.assertAllEqual(endpoints['block2_layer0'].shape, [1, 8, 8, 8, 56])
+    self.assertAllEqual(endpoints['block3_layer0'].shape, [1, 8, 8, 8, 56])
+    self.assertAllEqual(endpoints['block4_layer0'].shape, [1, 8, 4, 4, 104])
    self.assertAllEqual(endpoints['head'].shape, [1, 1, 1, 1, 480])

    self.assertNotEmpty(states)
@@ -59,11 +59,11 @@ class MoViNetTest(parameterized.TestCase, tf.test.TestCase):
    endpoints, new_states = backbone({**init_states, 'image': inputs})

    self.assertAllEqual(endpoints['stem'].shape, [1, 8, 64, 64, 8])
-    self.assertAllEqual(endpoints['b0/l0'].shape, [1, 8, 32, 32, 8])
-    self.assertAllEqual(endpoints['b1/l0'].shape, [1, 8, 16, 16, 32])
-    self.assertAllEqual(endpoints['b2/l0'].shape, [1, 8, 8, 8, 56])
-    self.assertAllEqual(endpoints['b3/l0'].shape, [1, 8, 8, 8, 56])
-    self.assertAllEqual(endpoints['b4/l0'].shape, [1, 8, 4, 4, 104])
+    self.assertAllEqual(endpoints['block0_layer0'].shape, [1, 8, 32, 32, 8])
+    self.assertAllEqual(endpoints['block1_layer0'].shape, [1, 8, 16, 16, 32])
+    self.assertAllEqual(endpoints['block2_layer0'].shape, [1, 8, 8, 8, 56])
+    self.assertAllEqual(endpoints['block3_layer0'].shape, [1, 8, 8, 8, 56])
+    self.assertAllEqual(endpoints['block4_layer0'].shape, [1, 8, 4, 4, 104])
    self.assertAllEqual(endpoints['head'].shape, [1, 1, 1, 1, 480])

    self.assertNotEmpty(init_states)

--- a/official/vision/beta/projects/simclr/common/registry_imports.py
+++ b/official/vision/beta/projects/simclr/common/registry_imports.py
@@ -12,20 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """All necessary imports for registration."""

 # pylint: disable=unused-import

--- a/official/vision/beta/projects/simclr/configs/experiments/imagenet_simclr_multitask_tpu.yaml
+++ b/official/vision/beta/projects/simclr/configs/experiments/imagenet_simclr_multitask_tpu.yaml
+runtime:
+  distribution_strategy: tpu
+  mixed_precision_dtype: 'bfloat16'
+task:
+  init_checkpoint: ''
+  model:
+    backbone:
+      resnet:
+        model_id: 50
+      type: resnet
+    projection_head:
+      ft_proj_idx: 1
+      num_proj_layers: 3
+      proj_output_dim: 128
+    backbone_trainable: true
+    heads: !!python/tuple
+    # Define heads for the PRETRAIN networks here
+    - task_name: pretrain_imagenet
+      mode: pretrain
+    # Define heads for the FINETUNE networks here
+    - task_name: finetune_imagenet_10percent
+      mode: finetune
+      supervised_head:
+        num_classes: 1001
+        zero_init: true
+    input_size: [224, 224, 3]
+    l2_weight_decay: 0.0
+    norm_activation:
+      norm_epsilon: 1.0e-05
+      norm_momentum: 0.9
+      use_sync_bn: true
+  task_routines: !!python/tuple
+  # Define TASK CONFIG for the PRETRAIN networks here
+  - task_name: pretrain_imagenet
+    task_weight: 30.0
+    task_config:
+      evaluation:
+        one_hot: true
+        top_k: 5
+      loss:
+        l2_weight_decay: 0.0
+        projection_norm: true
+        temperature: 0.1
+      model:
+        input_size: [224, 224, 3]
+        mode: pretrain
+      train_data:
+        input_path: /readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/train*
+        input_set_label_to_zero: true    # Set labels to zeros to double confirm that no label is used during pretrain
+        is_training: true
+        global_batch_size: 4096
+        dtype: 'bfloat16'
+        parser:
+          aug_rand_hflip: true
+          mode: pretrain
+        decoder:
+          decode_label: true
+      validation_data:
+        input_path: /readahead/200M/placer/prod/home/distbelief/imagenet-tensorflow/imagenet-2012-tfrecord/valid*
+        is_training: false
+        global_batch_size: 2048
+        dtype: 'bfloat16'
+        drop_remainder: false
+        parser:
+          mode: pretrain
+        decoder:
+          decode_label: true
+  # Define TASK CONFIG for the FINETUNE Networks here
+  - task_name: finetune_imagenet_10percent
+    task_weight: 1.0
+    task_config:
+      evaluation:
+        one_hot: true
+        top_k: 5
+      loss:
+        l2_weight_decay: 0.0
+        label_smoothing: 0.0
+        one_hot: true
+      model:
+        input_size: [224, 224, 3]
+        mode: finetune
+        supervised_head:
+          num_classes: 1001
+          zero_init: true
+      train_data:
+        tfds_name: 'imagenet2012_subset/10pct'
+        tfds_split: 'train'
+        input_path: ''
+        is_training: true
+        global_batch_size: 1024
+        dtype: 'bfloat16'
+        parser:
+          aug_rand_hflip: true
+          mode: finetune
+        decoder:
+          decode_label: true
+      validation_data:
+        tfds_name: 'imagenet2012_subset/10pct'
+        tfds_split: 'validation'
+        input_path: ''
+        is_training: false
+        global_batch_size: 2048
+        dtype: 'bfloat16'
+        drop_remainder: false
+        parser:
+          mode: finetune
+        decoder:
+          decode_label: true
+trainer:
+  trainer_type: interleaving
+  task_sampler:
+    proportional:
+      alpha: 1.0
+    type: proportional
+  train_steps: 32000   # 100 epochs
+  validation_steps: 24  # NUM_EXAMPLES (50000) // global_batch_size
+  validation_interval: 625
+  steps_per_loop: 625  # NUM_EXAMPLES (1281167) // global_batch_size
+  summary_interval: 625
+  checkpoint_interval: 625
+  max_to_keep: 3
+  optimizer_config:
+    learning_rate:
+      cosine:
+        decay_steps: 32000
+        initial_learning_rate: 4.8
+      type: cosine
+    optimizer:
+      lars:
+        exclude_from_weight_decay: [batch_normalization, bias]
+        momentum: 0.9
+        weight_decay_rate: 1.0e-06
+      type: lars
+    warmup:
+      linear:
+        name: linear
+        warmup_steps: 3200
+      type: linear
--- a/official/vision/beta/projects/simclr/configs/multitask_config.py
+++ b/official/vision/beta/projects/simclr/configs/multitask_config.py
@@ -29,6 +29,7 @@ from official.vision.beta.projects.simclr.modeling import simclr_model
 @dataclasses.dataclass
 class SimCLRMTHeadConfig(hyperparams.Config):
  """Per-task specific configs."""
+  task_name: str = 'task_name'
  # Supervised head is required for finetune, but optional for pretrain.
  supervised_head: simclr_configs.SupervisedHead = simclr_configs.SupervisedHead(
      num_classes=1001)
@@ -57,14 +58,17 @@ def multitask_simclr() -> multitask_configs.MultiTaskExperimentConfig:
  return multitask_configs.MultiTaskExperimentConfig(
      task=multitask_configs.MultiTaskConfig(
          model=SimCLRMTModelConfig(
-              heads=(SimCLRMTHeadConfig(mode=simclr_model.PRETRAIN),
-                     SimCLRMTHeadConfig(mode=simclr_model.FINETUNE))),
+              heads=(SimCLRMTHeadConfig(
+                  task_name='pretrain_simclr', mode=simclr_model.PRETRAIN),
+                     SimCLRMTHeadConfig(
+                         task_name='finetune_simclr',
+                         mode=simclr_model.FINETUNE))),
          task_routines=(multitask_configs.TaskRoutine(
-              task_name=simclr_model.PRETRAIN,
+              task_name='pretrain_simclr',
              task_config=simclr_configs.SimCLRPretrainTask(),
              task_weight=2.0),
                         multitask_configs.TaskRoutine(
-                             task_name=simclr_model.FINETUNE,
+                             task_name='finetune_simclr',
                             task_config=simclr_configs.SimCLRFinetuneTask(),
                             task_weight=1.0))),
      trainer=multitask_configs.MultiTaskTrainerConfig())
--- a/official/vision/beta/projects/simclr/configs/simclr.py
+++ b/official/vision/beta/projects/simclr/configs/simclr.py
@@ -12,27 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """SimCLR configurations."""
+import dataclasses
 import os
 from typing import List, Optional

-import dataclasses
-
 from official.core import config_definitions as cfg
 from official.core import exp_factory
 from official.modeling import hyperparams
@@ -73,6 +57,9 @@ class DataConfig(cfg.DataConfig):
  # simclr specific configs
  parser: Parser = Parser()
  decoder: Decoder = Decoder()
+  # Useful as a sanity check that absolutely no labels are used during
+  # pretraining: sets labels to zeros (default = False, keep original labels).
+  input_set_label_to_zero: bool = False


 @dataclasses.dataclass
@@ -115,9 +102,7 @@ class SimCLRModel(hyperparams.Config):
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  projection_head: ProjectionHead = ProjectionHead(
-      proj_output_dim=128,
-      num_proj_layers=3,
-      ft_proj_idx=1)
+      proj_output_dim=128, num_proj_layers=3, ft_proj_idx=1)
  supervised_head: SupervisedHead = SupervisedHead(num_classes=1001)
  norm_activation: common.NormActivation = common.NormActivation(
      norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)
@@ -201,9 +186,7 @@ def simclr_pretraining_imagenet() -> cfg.ExperimentConfig:
              backbone=backbones.Backbone(
                  type='resnet', resnet=backbones.ResNet(model_id=50)),
              projection_head=ProjectionHead(
-                  proj_output_dim=128,
-                  num_proj_layers=3,
-                  ft_proj_idx=1),
+                  proj_output_dim=128, num_proj_layers=3, ft_proj_idx=1),
              supervised_head=SupervisedHead(num_classes=1001),
              norm_activation=common.NormActivation(
                  norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=True)),
@@ -233,10 +216,13 @@ def simclr_pretraining_imagenet() -> cfg.ExperimentConfig:
              'optimizer': {
                  'type': 'lars',
                  'lars': {
-                      'momentum': 0.9,
-                      'weight_decay_rate': 0.000001,
+                      'momentum':
+                          0.9,
+                      'weight_decay_rate':
+                          0.000001,
                      'exclude_from_weight_decay': [
-                          'batch_normalization', 'bias']
+                          'batch_normalization', 'bias'
+                      ]
                  }
              },
              'learning_rate': {
@@ -278,11 +264,8 @@ def simclr_finetuning_imagenet() -> cfg.ExperimentConfig:
              backbone=backbones.Backbone(
                  type='resnet', resnet=backbones.ResNet(model_id=50)),
              projection_head=ProjectionHead(
-                  proj_output_dim=128,
-                  num_proj_layers=3,
-                  ft_proj_idx=1),
-              supervised_head=SupervisedHead(
-                  num_classes=1001, zero_init=True),
+                  proj_output_dim=128, num_proj_layers=3, ft_proj_idx=1),
+              supervised_head=SupervisedHead(num_classes=1001, zero_init=True),
              norm_activation=common.NormActivation(
                  norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)),
          loss=ClassificationLosses(),
@@ -311,10 +294,13 @@ def simclr_finetuning_imagenet() -> cfg.ExperimentConfig:
              'optimizer': {
                  'type': 'lars',
                  'lars': {
-                      'momentum': 0.9,
-                      'weight_decay_rate': 0.0,
+                      'momentum':
+                          0.9,
+                      'weight_decay_rate':
+                          0.0,
                      'exclude_from_weight_decay': [
-                          'batch_normalization', 'bias']
+                          'batch_normalization', 'bias'
+                      ]
                  }
              },
              'learning_rate': {

--- a/official/vision/beta/projects/simclr/configs/simclr_test.py
+++ b/official/vision/beta/projects/simclr/configs/simclr_test.py
@@ -12,23 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for simclr."""
-# pylint: disable=unused-import
+"""Tests for SimCLR config."""
 from absl.testing import parameterized

 import tensorflow as tf

--- a/official/vision/beta/projects/simclr/dataloaders/preprocess_ops.py
+++ b/official/vision/beta/projects/simclr/dataloaders/preprocess_ops.py
@@ -12,20 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """Preprocessing ops."""
 import functools
 import tensorflow as tf

--- a/official/vision/beta/projects/simclr/dataloaders/simclr_input.py
+++ b/official/vision/beta/projects/simclr/dataloaders/simclr_input.py
@@ -12,20 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """Data parser and processing for SimCLR.

 For pre-training: