Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
53eff257
Commit
53eff257
authored
Oct 30, 2019
by
Yeqing Li
Committed by
A. Unique TensorFlower
Oct 30, 2019
Browse files
Internal change.
PiperOrigin-RevId: 277584854
parent
9f3b0269
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
856 additions
and
0 deletions
+856
-0
official/vision/detection/dataloader/maskrcnn_parser.py
official/vision/detection/dataloader/maskrcnn_parser.py
+378
-0
official/vision/detection/dataloader/shapemask_parser.py
official/vision/detection/dataloader/shapemask_parser.py
+478
-0
No files found.
official/vision/detection/dataloader/maskrcnn_parser.py
0 → 100644
View file @
53eff257
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data parser and processing for Mask R-CNN."""
import
tensorflow.compat.v2
as
tf
from
official.vision.detection.dataloader
import
anchor
from
official.vision.detection.dataloader
import
mode_keys
as
ModeKeys
from
official.vision.detection.dataloader
import
tf_example_decoder
from
official.vision.detection.utils
import
box_utils
from
official.vision.detection.utils
import
dataloader_utils
from
official.vision.detection.utils
import
input_utils
class Parser(object):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               rpn_match_threshold=0.7,
               rpn_unmatched_threshold=0.3,
               rpn_batch_size_per_im=256,
               rpn_fg_fraction=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               include_mask=False,
               mask_crop_size=112,
               use_bfloat16=True,
               mode=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added
        on each level. For instances, num_scales=2 adds one additional
        intermediate anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      rpn_match_threshold: `float` matching threshold forwarded to
        anchor.RpnAnchorLabeler; presumably the IoU lower bound for assigning
        positive RPN labels to anchors.
      rpn_unmatched_threshold: `float` threshold forwarded to
        anchor.RpnAnchorLabeler; presumably the IoU upper bound below which
        anchors get negative RPN labels.
      rpn_batch_size_per_im: `int` forwarded to anchor.RpnAnchorLabeler;
        presumably the number of anchors sampled per image for RPN training.
      rpn_fg_fraction: `float` forwarded to anchor.RpnAnchorLabeler; presumably
        the fraction of the sampled RPN anchors that are foreground.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled with
        `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      include_mask: a bool to indicate whether parse mask groundtruth.
      mask_crop_size: the size which groundtruth mask is cropped to.
      use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction
        or prediction with groundtruths in the outputs.
    """
    self._mode = mode
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    self._is_training = (mode == ModeKeys.TRAIN)

    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=include_mask)

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size

    # Target assigning.
    self._rpn_match_threshold = rpn_match_threshold
    self._rpn_unmatched_threshold = rpn_unmatched_threshold
    self._rpn_batch_size_per_im = rpn_batch_size_per_im
    self._rpn_fg_fraction = rpn_fg_fraction

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Mask.
    self._include_mask = include_mask
    self._mask_crop_size = mask_crop_size

    # Device.
    self._use_bfloat16 = use_bfloat16

    # Data is parsed depending on the model Modekey.
    if mode == ModeKeys.TRAIN:
      self._parse_fn = self._parse_train_data
    elif mode == ModeKeys.EVAL:
      self._parse_fn = self._parse_eval_data
    elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT:
      self._parse_fn = self._parse_predict_data
    else:
      raise ValueError('mode is not defined.')
def
__call__
(
self
,
value
):
"""Parses data to an image and associated training labels.
Args:
value: a string tensor holding a serialized tf.Example proto.
Returns:
image, labels: if mode == ModeKeys.TRAIN. see _parse_train_data.
{'images': image, 'labels': labels}: if mode == ModeKeys.PREDICT
or ModeKeys.PREDICT_WITH_GT.
"""
with
tf
.
name_scope
(
'parser'
):
data
=
self
.
_example_decoder
.
decode
(
value
)
return
self
.
_parse_fn
(
data
)
  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
          in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
          image that is fed to the network. The tensor is padded with -1 to
          the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
      num_groundtrtuhs = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
        # When there are no annotations at all, tf.where would produce an
        # empty result anyway; the tf.cond keeps the two branches' dtypes
        # (int64 indices) consistent for the downstream gathers.
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = input_utils.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes = input_utils.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image. The output is padded so that each side is
    # divisible by the largest feature stride 2^max_level.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    # image_info rows: [2] is [y_scale, x_scale], [3] is [y_offset, x_offset].
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, (image_height, image_width), offset)
    if self._include_mask:
      # resize_and_crop_masks expects a trailing channel dimension.
      masks = input_utils.resize_and_crop_masks(
          tf.expand_dims(masks, axis=-1), image_scale,
          (image_height, image_width), offset)
      masks = tf.squeeze(masks, axis=-1)

    # Filters out ground truth boxes that are all zeros.
    indices = input_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Crops each instance mask by its (scaled-image) box and resizes it to
      # the fixed [mask_crop_size, mask_crop_size] training target.
      num_masks = tf.shape(masks)[0]
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          box_utils.normalize_boxes(boxes, tf.shape(image)[0:2]),
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.Anchor(
        self._min_level, self._max_level, self._num_scales,
        self._aspect_ratios, self._anchor_size,
        (image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        input_anchor,
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
    }
    labels['gt_boxes'] = input_utils.pad_to_fixed_size(
        boxes, self._max_num_instances, -1)
    labels['gt_classes'] = input_utils.pad_to_fixed_size(
        classes, self._max_num_instances, -1)
    if self._include_mask:
      labels['gt_masks'] = input_utils.pad_to_fixed_size(
          masks, self._max_num_instances, -1)
    return image, labels
  def _parse_eval_data(self, data):
    """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder (unused).

    Raises:
      NotImplementedError: always; evaluation parsing is not implemented for
        this parser.
    """
    raise NotImplementedError('Not implemented!')
  def _parse_predict_data(self, data):
    """Parses data for prediction.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A tuple of (image, labels) where
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for inference. The following
        describes {key: value} pairs in the dictionary.
        source_id: Source image id. Default value -1 if the source id is
          empty in the groundtruth annotation.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          [y_scale, x_scale], [y_offset, x_offset]].
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        groundtruths: only present when mode == ModeKeys.PREDICT_WITH_GT; the
          padded groundtruth annotations for evaluation.
    """
    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Resizes and crops image with fixed scale (no scale augmentation at
    # prediction time).
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    image_height, image_width, _ = image.get_shape().as_list()

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    # Compute Anchor boxes.
    input_anchor = anchor.Anchor(
        self._min_level, self._max_level, self._num_scales,
        self._aspect_ratios, self._anchor_size,
        (image_height, image_width))

    labels = {
        'source_id': dataloader_utils.process_source_id(data['source_id']),
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
    }

    if self._mode == ModeKeys.PREDICT_WITH_GT:
      # Converts boxes from normalized coordinates to pixel coordinates.
      boxes = box_utils.denormalize_boxes(
          data['groundtruth_boxes'], image_shape)
      groundtruths = {
          'source_id': data['source_id'],
          'height': data['height'],
          'width': data['width'],
          'num_detections': tf.shape(data['groundtruth_classes']),
          'boxes': boxes,
          'classes': data['groundtruth_classes'],
          'areas': data['groundtruth_area'],
          'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
      }
      groundtruths['source_id'] = dataloader_utils.process_source_id(
          groundtruths['source_id'])
      groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
          groundtruths, self._max_num_instances)
      labels['groundtruths'] = groundtruths
    return image, labels
official/vision/detection/dataloader/shapemask_parser.py
0 → 100644
View file @
53eff257
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data parser and processing.
Parse image and ground truths in a dataset to training targets and package them
into (image, labels) tuple for ShapeMask.
Weicheng Kuo, Anelia Angelova, Jitendra Malik, Tsung-Yi Lin
ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors.
arXiv:1904.03239.
"""
import
tensorflow.compat.v2
as
tf
from
official.vision.detection.dataloader
import
anchor
from
official.vision.detection.dataloader
import
mode_keys
as
ModeKeys
from
official.vision.detection.dataloader
import
tf_example_decoder
from
official.vision.detection.utils
import
box_utils
from
official.vision.detection.utils
import
class_utils
from
official.vision.detection.utils
import
dataloader_utils
from
official.vision.detection.utils
import
input_utils
class Parser(object):
  """Parser to parse an image and its annotations into a dictionary of tensors."""

  def __init__(self,
               output_size,
               min_level,
               max_level,
               num_scales,
               aspect_ratios,
               anchor_size,
               use_category=True,
               outer_box_scale=1.0,
               box_jitter_scale=0.025,
               num_sampled_masks=8,
               mask_crop_size=32,
               mask_min_level=3,
               mask_max_level=5,
               upsample_factor=4,
               match_threshold=0.5,
               unmatched_threshold=0.5,
               aug_rand_hflip=False,
               aug_scale_min=1.0,
               aug_scale_max=1.0,
               skip_crowd_during_training=True,
               max_num_instances=100,
               use_bfloat16=True,
               mask_train_class='all',
               mode=None):
    """Initializes parameters for parsing annotations in the dataset.

    Args:
      output_size: `Tensor` or `list` for [height, width] of output image. The
        output_size should be divided by the largest feature stride 2^max_level.
      min_level: `int` number of minimum level of the output feature pyramid.
      max_level: `int` number of maximum level of the output feature pyramid.
      num_scales: `int` number representing intermediate scales added
        on each level. For instances, num_scales=2 adds one additional
        intermediate anchor scales [2^0, 2^0.5] on each level.
      aspect_ratios: `list` of float numbers representing the aspect ratio
        anchors added on each level. The number indicates the ratio of width to
        height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
        on each scale level.
      anchor_size: `float` number representing the scale of size of the base
        anchor to the feature stride 2^level.
      use_category: if `False`, treat all object in all classes in one
        foreground category.
      outer_box_scale: `float` number in a range of [1.0, inf) representing
        the scale from object box to outer box. The mask branch predicts
        instance mask enclosed in outer box.
      box_jitter_scale: `float` number representing the noise magnitude to
        jitter the training groundtruth boxes for mask branch.
      num_sampled_masks: `int` number of sampled masks for training.
      mask_crop_size: `list` for [height, width] of output training masks.
      mask_min_level: `int` number indicating the minimum feature level to
        obtain instance features.
      mask_max_level: `int` number indicating the maximum feature level to
        obtain instance features.
      upsample_factor: `int` factor of upsampling the fine mask predictions.
      match_threshold: `float` number between 0 and 1 representing the
        lower-bound threshold to assign positive labels for anchors. An anchor
        with a score over the threshold is labeled positive.
      unmatched_threshold: `float` number between 0 and 1 representing the
        upper-bound threshold to assign negative labels for anchors. An anchor
        with a score below the threshold is labeled negative.
      aug_rand_hflip: `bool`, if True, augment training with random
        horizontal flip.
      aug_scale_min: `float`, the minimum scale applied to `output_size` for
        data augmentation during training.
      aug_scale_max: `float`, the maximum scale applied to `output_size` for
        data augmentation during training.
      skip_crowd_during_training: `bool`, if True, skip annotations labeled with
        `is_crowd` equals to 1.
      max_num_instances: `int` number of maximum number of instances in an
        image. The groundtruth data will be padded to `max_num_instances`.
      use_bfloat16: `bool`, if True, cast output image to tf.bfloat16.
      mask_train_class: a string of experiment mode: `all`, `voc` or `nonvoc`.
      mode: a ModeKeys. Specifies if this is training, evaluation, prediction
        or prediction with groundtruths in the outputs.
    """
    self._mode = mode
    self._mask_train_class = mask_train_class
    self._max_num_instances = max_num_instances
    self._skip_crowd_during_training = skip_crowd_during_training
    self._is_training = (mode == ModeKeys.TRAIN)

    # ShapeMask always needs instance masks from the decoder.
    self._example_decoder = tf_example_decoder.TfExampleDecoder(
        include_mask=True)

    # Anchor.
    self._output_size = output_size
    self._min_level = min_level
    self._max_level = max_level
    self._num_scales = num_scales
    self._aspect_ratios = aspect_ratios
    self._anchor_size = anchor_size
    self._match_threshold = match_threshold
    self._unmatched_threshold = unmatched_threshold

    # Data augmentation.
    self._aug_rand_hflip = aug_rand_hflip
    self._aug_scale_min = aug_scale_min
    self._aug_scale_max = aug_scale_max

    # Device.
    self._use_bfloat16 = use_bfloat16

    # ShapeMask specific.
    # Control of which category to use.
    self._use_category = use_category
    self._num_sampled_masks = num_sampled_masks
    self._mask_crop_size = mask_crop_size
    self._mask_min_level = mask_min_level
    self._mask_max_level = mask_max_level
    self._outer_box_scale = outer_box_scale
    self._box_jitter_scale = box_jitter_scale
    self._up_sample_factor = upsample_factor

    # Data is parsed depending on the model Modekey.
    if mode == ModeKeys.TRAIN:
      self._parse_fn = self._parse_train_data
    elif mode == ModeKeys.EVAL:
      self._parse_fn = self._parse_eval_data
    elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT:
      self._parse_fn = self._parse_predict_data
    else:
      raise ValueError('mode is not defined.')
def
__call__
(
self
,
value
):
"""Parses data to an image and associated training labels.
Args:
value: a string tensor holding a serialized tf.Example proto.
Returns:
image: image tensor that is preproessed to have normalized value and
dimension [output_size[0], output_size[1], 3]
labels:
cls_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location]. The height_l and
width_l represent the dimension of class logits at l-th level.
box_targets: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, anchors_per_location * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
num_positives: number of positive anchors in the image.
anchor_boxes: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, 4] representing anchor boxes at each level.
image_scale: 2D float `Tensor` representing scale factors that apply
to [height, width] of input image.
mask_boxes: sampled boxes that tightly enclose the training masks. The
box is represented in [y1, x1, y2, x2] format. The tensor is sampled
to the fixed dimension [self._num_sampled_masks, 4].
mask_outer_boxes: loose box that enclose sampled tight box. The
box is represented in [y1, x1, y2, x2] format. The tensor is sampled
to the fixed dimension [self._num_sampled_masks, 4].
mask_targets: training binary mask targets. The tensor has shape
[self._num_sampled_masks, self._mask_crop_size, self._mask_crop_size].
mask_classes: the class ids of sampled training masks. The tensor has
shape [self._num_sampled_masks].
mask_is_valid: the binary tensor to indicate if the sampled masks are
valide. The sampled masks are invalid when no mask annotations are
included in the image. The tensor has shape [1].
groundtruths:
source_id: source image id. Default value -1 if the source id is empty
in the groundtruth annotation.
boxes: groundtruth bounding box annotations. The box is represented in
[y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed
dimension [self._max_num_instances, 4].
classes: groundtruth classes annotations. The tensor is padded with
-1 to the fixed dimension [self._max_num_instances].
areas: groundtruth areas annotations. The tensor is padded with -1
to the fixed dimension [self._max_num_instances].
is_crowds: groundtruth annotations to indicate if an annotation
represents a group of instances by value {0, 1}. The tensor is
padded with 0 to the fixed dimension [self._max_num_instances].
"""
with
tf
.
name_scope
(
'parser'
):
data
=
self
.
_example_decoder
.
decode
(
value
)
return
self
.
_parse_fn
(
data
)
  def _parse_train_data(self, data):
    """Parse data for ShapeMask training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image, labels: the preprocessed image and the training label dictionary
        (see `__call__` for the full description of the label keys).
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    masks = data['groundtruth_instance_masks']
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
      num_groundtrtuhs = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
        # tf.cond keeps both branches producing int64 indices so the
        # downstream gathers are shape/dtype consistent even when the image
        # has no annotations.
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # If not using category, makes all categories with id = 0.
    if not self._use_category:
      classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      image, boxes, masks = input_utils.random_horizontal_flip(
          image, boxes, masks)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        self._output_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    # image_info rows: [2] is [y_scale, x_scale], [3] is [y_offset, x_offset].
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Resizes and crops boxes and masks.
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, self._output_size, offset)

    # Filters out ground truth boxes that are all zeros.
    indices = input_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    masks = tf.gather(masks, indices)

    # Assigns anchors.
    input_anchor = anchor.Anchor(
        self._min_level, self._max_level, self._num_scales,
        self._aspect_ratios, self._anchor_size, self._output_size)
    anchor_labeler = anchor.AnchorLabeler(
        input_anchor, self._match_threshold, self._unmatched_threshold)
    (cls_targets,
     box_targets,
     num_positives) = anchor_labeler.label_anchors(
         boxes,
         tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

    # Sample groundtruth masks/boxes/classes for mask branch.
    num_masks = tf.shape(masks)[0]
    mask_shape = tf.shape(masks)[1:3]

    # Pad sampled boxes/masks/classes to a constant batch size.
    padded_boxes = input_utils.pad_to_fixed_size(boxes,
                                                 self._num_sampled_masks)
    padded_classes = input_utils.pad_to_fixed_size(classes,
                                                   self._num_sampled_masks)
    padded_masks = input_utils.pad_to_fixed_size(masks,
                                                 self._num_sampled_masks)

    # Randomly sample groundtruth masks for mask branch training. For the image
    # without groundtruth masks, it will sample the dummy padded tensors.
    # The mod by max(num_masks, 1) maps shuffled indices back into the range
    # of real (or dummy) instances and avoids a division by zero.
    rand_indices = tf.random.shuffle(
        tf.range(tf.maximum(num_masks, self._num_sampled_masks)))
    rand_indices = tf.math.mod(rand_indices, tf.maximum(num_masks, 1))
    rand_indices = rand_indices[0:self._num_sampled_masks]
    rand_indices = tf.reshape(rand_indices, [self._num_sampled_masks])

    sampled_boxes = tf.gather(padded_boxes, rand_indices)
    sampled_classes = tf.gather(padded_classes, rand_indices)
    sampled_masks = tf.gather(padded_masks, rand_indices)
    # Jitter the sampled boxes to mimic the noisy detections.
    sampled_boxes = box_utils.jitter_boxes(
        sampled_boxes, noise_scale=self._box_jitter_scale)
    sampled_boxes = box_utils.clip_boxes(sampled_boxes, self._output_size)

    # Compute mask targets in feature crop. A feature crop fully contains a
    # sampled box.
    mask_outer_boxes = box_utils.compute_outer_boxes(
        sampled_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale)
    mask_outer_boxes = box_utils.clip_boxes(
        mask_outer_boxes, self._output_size)
    # Compensate the offset of mask_outer_boxes to map it back to original image
    # scale. Note: the += / /= below rebind the name to new tensors; they do
    # not mutate `mask_outer_boxes` (tf.Tensors are immutable).
    mask_outer_boxes_ori = mask_outer_boxes
    mask_outer_boxes_ori += tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    mask_outer_boxes_ori /= tf.tile(
        tf.expand_dims(image_scale, axis=0), [1, 2])
    norm_mask_outer_boxes_ori = box_utils.normalize_boxes(
        mask_outer_boxes_ori, mask_shape)

    # Set sampled_masks shape to [batch_size, height, width, 1].
    sampled_masks = tf.cast(
        tf.expand_dims(sampled_masks, axis=-1), tf.float32)
    mask_targets = tf.image.crop_and_resize(
        sampled_masks,
        norm_mask_outer_boxes_ori,
        box_indices=tf.range(self._num_sampled_masks),
        crop_size=[self._mask_crop_size, self._mask_crop_size],
        method='bilinear',
        extrapolation_value=0,
        name='train_mask_targets')
    # Binarize the bilinearly-interpolated crops at 0.5.
    mask_targets = tf.where(
        tf.greater_equal(mask_targets, 0.5),
        tf.ones_like(mask_targets),
        tf.zeros_like(mask_targets))
    mask_targets = tf.squeeze(mask_targets, axis=-1)
    if self._up_sample_factor > 1:
      # Also produce finer-resolution targets for the upsampled predictions.
      fine_mask_targets = tf.image.crop_and_resize(
          sampled_masks,
          norm_mask_outer_boxes_ori,
          box_indices=tf.range(self._num_sampled_masks),
          crop_size=[
              self._mask_crop_size * self._up_sample_factor,
              self._mask_crop_size * self._up_sample_factor
          ],
          method='bilinear',
          extrapolation_value=0,
          name='train_mask_targets')
      fine_mask_targets = tf.where(
          tf.greater_equal(fine_mask_targets, 0.5),
          tf.ones_like(fine_mask_targets),
          tf.zeros_like(fine_mask_targets))
      fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1)
    else:
      fine_mask_targets = mask_targets

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    # Sampled masks are only valid when the image has at least one
    # groundtruth mask annotation.
    valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32)
    if self._mask_train_class == 'all':
      mask_is_valid = valid_image * tf.ones_like(sampled_classes, tf.int32)
    else:
      # Get the intersection of sampled classes with training splits.
      mask_valid_classes = tf.cast(
          tf.expand_dims(
              class_utils.coco_split_class_ids(self._mask_train_class), 1),
          sampled_classes.dtype)
      match = tf.reduce_any(
          tf.equal(tf.expand_dims(sampled_classes, 0), mask_valid_classes),
          0)
      mask_is_valid = valid_image * tf.cast(match, tf.int32)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': input_anchor.multilevel_boxes,
        'num_positives': num_positives,
        'image_info': image_info,
        # For ShapeMask.
        'mask_boxes': sampled_boxes,
        'mask_outer_boxes': mask_outer_boxes,
        'mask_targets': mask_targets,
        'fine_mask_targets': fine_mask_targets,
        'mask_classes': sampled_classes,
        'mask_is_valid': mask_is_valid,
    }
    return image, labels
  def _parse_predict_data(self, data):
    """Parse data for ShapeMask prediction.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image, labels: the preprocessed image and a label dictionary containing
        `anchor_boxes` and `image_info`; when mode == ModeKeys.PREDICT_WITH_GT
        it additionally contains `cls_targets`, `box_targets`,
        `num_positives` and the padded `groundtruths`.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    masks = data['groundtruth_instance_masks']

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # If not using category, makes all categories with id = 0.
    if not self._use_category:
      classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Converts boxes from normalized coordinates to pixel coordinates.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image with fixed scale (no scale augmentation at
    # prediction time).
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        self._output_size,
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    # image_info rows: [2] is [y_scale, x_scale], [3] is [y_offset, x_offset].
    image_scale = image_info[2, :]
    offset = image_info[3, :]

    # Resizes and crops boxes and masks.
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, self._output_size, offset)
    masks = input_utils.resize_and_crop_masks(
        tf.expand_dims(masks, axis=-1), image_scale, self._output_size,
        offset)

    # Filters out ground truth boxes that are all zeros.
    indices = input_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)

    # Assigns anchors.
    input_anchor = anchor.Anchor(
        self._min_level, self._max_level, self._num_scales,
        self._aspect_ratios, self._anchor_size, self._output_size)
    anchor_labeler = anchor.AnchorLabeler(
        input_anchor, self._match_threshold, self._unmatched_threshold)

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
      image = tf.cast(image, dtype=tf.bfloat16)

    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
    }
    if self._mode == ModeKeys.PREDICT_WITH_GT:
      # Converts boxes from normalized coordinates to pixel coordinates.
      groundtruths = {
          'source_id': data['source_id'],
          'num_detections': tf.shape(data['groundtruth_classes']),
          'boxes': box_utils.denormalize_boxes(
              data['groundtruth_boxes'], image_shape),
          'classes': data['groundtruth_classes'],
          # 'masks': tf.squeeze(masks, axis=-1),
          'areas': data['groundtruth_area'],
          'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
      }
      groundtruths['source_id'] = dataloader_utils.process_source_id(
          groundtruths['source_id'])
      groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
          groundtruths, self._max_num_instances)
      # Computes training labels.
      (cls_targets,
       box_targets,
       num_positives) = anchor_labeler.label_anchors(
           boxes,
           tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
      # Packs labels for model_fn outputs.
      labels.update({
          'cls_targets': cls_targets,
          'box_targets': box_targets,
          'num_positives': num_positives,
          'groundtruths': groundtruths,
      })
    return image, labels
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment