Commit d4f37e87 authored by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 344335367
parent 51f4ecad
@@ -16,6 +16,7 @@
from official.modeling.hyperparams import params_dict
from official.vision.detection.configs import maskrcnn_config
from official.vision.detection.configs import olnmask_config
from official.vision.detection.configs import retinanet_config
from official.vision.detection.configs import shapemask_config
@@ -28,6 +29,9 @@ def config_generator(model):
elif model == 'mask_rcnn':
default_config = maskrcnn_config.MASKRCNN_CFG
restrictions = maskrcnn_config.MASKRCNN_RESTRICTIONS
elif model == 'olnmask':
default_config = olnmask_config.OLNMASK_CFG
restrictions = olnmask_config.OLNMASK_RESTRICTIONS
elif model == 'shapemask':
default_config = shapemask_config.SHAPEMASK_CFG
restrictions = shapemask_config.SHAPEMASK_RESTRICTIONS
...
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Config template to train Object Localization Network (OLN)."""
from official.modeling.hyperparams import params_dict
from official.vision.detection.configs import base_config
# pylint: disable=line-too-long
OLNMASK_CFG = params_dict.ParamsDict(base_config.BASE_CFG)
OLNMASK_CFG.override({
'type': 'olnmask',
'eval': {
'type': 'oln_xclass_box',
'use_category': False,
'seen_class': 'voc',
'num_images_to_visualize': 0,
},
'architecture': {
'parser': 'olnmask_parser',
'min_level': 2,
'max_level': 6,
'include_rpn_class': False,
'include_frcnn_class': False,
'include_frcnn_box': True,
'include_mask': False,
'mask_target_size': 28,
'num_classes': 2,
},
'olnmask_parser': {
'output_size': [640, 640],
'num_channels': 3,
'rpn_match_threshold': 0.7,
'rpn_unmatched_threshold': 0.3,
'rpn_batch_size_per_im': 256,
'rpn_fg_fraction': 0.5,
'aug_rand_hflip': True,
'aug_scale_min': 0.5,
'aug_scale_max': 2.0,
'skip_crowd_during_training': True,
'max_num_instances': 100,
'mask_crop_size': 112,
# centerness targets.
'has_centerness': True,
'rpn_center_match_iou_threshold': 0.3,
'rpn_center_unmatched_iou_threshold': 0.1,
'rpn_num_center_samples_per_im': 256,
# class manipulation.
'class_agnostic': True,
'train_class': 'voc',
},
'anchor': {
'num_scales': 1,
'aspect_ratios': [1.0],
'anchor_size': 8,
},
'rpn_head': {
'num_convs': 2,
'num_filters': 256,
'use_separable_conv': False,
'use_batch_norm': False,
# RPN-Centerness learning.
'has_centerness': True,
},
'frcnn_head': {
'num_convs': 0,
'num_filters': 256,
'use_separable_conv': False,
'num_fcs': 2,
'fc_dims': 1024,
'use_batch_norm': False,
'has_scoring': True,
},
'mrcnn_head': {
'num_convs': 4,
'num_filters': 256,
'use_separable_conv': False,
'use_batch_norm': False,
'has_scoring': False,
},
'rpn_score_loss': {
'rpn_batch_size_per_im': 256,
},
'rpn_box_loss': {
'huber_loss_delta': 1.0 / 9.0,
},
'frcnn_box_loss': {
'huber_loss_delta': 1.0,
},
'frcnn_box_score_loss': {
'ignore_threshold': 0.3,
},
'roi_proposal': {
'rpn_pre_nms_top_k': 2000,
'rpn_post_nms_top_k': 2000,
'rpn_nms_threshold': 0.7,
'rpn_score_threshold': 0.0,
'rpn_min_size_threshold': 0.0,
'test_rpn_pre_nms_top_k': 2000,
'test_rpn_post_nms_top_k': 2000,
'test_rpn_nms_threshold': 0.7,
'test_rpn_score_threshold': 0.0,
'test_rpn_min_size_threshold': 0.0,
'use_batched_nms': False,
},
'roi_sampling': {
'num_samples_per_image': 512,
'fg_fraction': 0.25,
'fg_iou_thresh': 0.5,
'bg_iou_thresh_hi': 0.5,
'bg_iou_thresh_lo': 0.0,
'mix_gt_boxes': True,
},
'mask_sampling': {
'num_mask_samples_per_image': 128, # Typically = `num_samples_per_image` * `fg_fraction`.
},
'postprocess': {
'use_batched_nms': False,
'max_total_size': 100,
'nms_iou_threshold': 0.5,
'score_threshold': 0.00,
'pre_nms_num_boxes': 2000,
},
}, is_strict=False)
OLNMASK_RESTRICTIONS = [
# 'anchor.aspect_ratios == [1.0]',
# 'anchor.scales == 1',
]
# pylint: enable=line-too-long
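# Illustrative only (not part of this commit): a minimal sketch of consuming
# the OLN-Mask template through the config factory edited above. The factory
# module name and the override values are assumptions, not taken from the diff.
from official.vision.detection.configs import factory  # hypothetical module name

params = factory.config_generator('olnmask')
params.override({
    'architecture': {'include_mask': True},  # e.g. turn the mask branch on
}, is_strict=True)
params.validate()  # enforces OLNMASK_RESTRICTIONS (currently all commented out)
params.lock()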
@@ -407,3 +407,89 @@ class GenericDetectionGenerator(object):
nmsed_classes += 1
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class OlnDetectionGenerator(GenericDetectionGenerator):
"""Generates the final detected boxes with scores and classes."""
def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape,
is_single_fg_score=False, keep_nms=True):
"""Generate final detections for Object Localization Network (OLN).
Args:
box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
representing the class-specific box coordinates relative to anchors.
class_outputs: a tensor of shape of [batch_size, K, num_classes]
representing the class logits before applying score activation.
anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
corresponding anchor boxes w.r.t `box_outputs`.
image_shape: a tensor of shape of [batch_size, 2] storing the image height
and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
is_single_fg_score: a bool indicating whether class_outputs contains only a
single foreground score per box (True) or the background scores
concatenated as well (False). By default, class_outputs is a concatenation
of the foreground and background scores, i.e. is_single_fg_score=False.
keep_nms: a bool indicating whether to perform NMS.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size]
representing classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
if is_single_fg_score:
# Concatenates dummy background scores.
dummy_bg_scores = tf.zeros_like(class_outputs)
class_outputs = tf.stack([dummy_bg_scores, class_outputs], -1)
else:
class_outputs = tf.nn.softmax(class_outputs, axis=-1)
# Removes the background class.
class_outputs_shape = tf.shape(class_outputs)
batch_size = class_outputs_shape[0]
num_locations = class_outputs_shape[1]
num_classes = class_outputs_shape[-1]
num_detections = num_locations * (num_classes - 1)
class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
box_outputs = tf.reshape(
box_outputs,
tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
box_outputs = tf.reshape(box_outputs,
tf.stack([batch_size, num_detections, 4], axis=-1))
anchor_boxes = tf.reshape(
anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))
# Box decoding. For RPN outputs, box_outputs are all zeros.
decoded_boxes = box_utils.decode_boxes(
box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
# Box clipping
decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
decoded_boxes = tf.reshape(
decoded_boxes,
tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
if keep_nms:
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(decoded_boxes, class_outputs))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
else:
nmsed_boxes = decoded_boxes[:, :, 0, :]
nmsed_scores = class_outputs[:, :, 0]
nmsed_classes = tf.cast(tf.ones_like(nmsed_scores), tf.int32)
valid_detections = tf.cast(
tf.reduce_sum(tf.ones_like(nmsed_scores), axis=-1), tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
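# Illustrative only (not part of this commit): how the is_single_fg_score
# branch above handles a single class-agnostic score per box (assumed shape
# [batch_size, K] in that mode). A dummy background column is stacked in front
# of the foreground score and later sliced away, so NMS ranks boxes purely by
# objectness.
import tensorflow as tf

fg_scores = tf.constant([[0.9, 0.2, 0.4]])           # [batch=1, K=3]
dummy_bg = tf.zeros_like(fg_scores)
class_outputs = tf.stack([dummy_bg, fg_scores], -1)  # [1, 3, 2]; [..., 0] = bg
fg_only = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])  # [1, 3, 1]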
@@ -231,3 +231,237 @@ class ROIGenerator(object):
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True)
return proposed_rois, proposed_roi_scores
class OlnROIGenerator(ROIGenerator):
"""Proposes RoIs for the second stage processing."""
def __call__(self, boxes, scores, anchor_boxes, image_shape, is_training,
is_box_lrtb=False, object_scores=None):
"""Generates RoI proposals.
Args:
boxes: a dict with keys representing FPN levels and values representing
box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
is_training: a bool indicating whether it is in training or inference
mode.
is_box_lrtb: a bool indicating whether boxes are in lrtb (left, right, top,
bottom) format.
object_scores: another objectness score (e.g., centerness). In OLN, we use
object_scores=centerness as a replacement of the scores at each level.
A dict with keys representing FPN levels and values representing logit
tensors of shape [batch_size, feature_h, feature_w, num_anchors].
Returns:
proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the proposed RoIs w.r.t. the
scaled image.
proposed_roi_scores: a tensor of shape
[batch_size, rpn_post_nms_top_k, 1], representing the scores of the
proposed RoIs.
"""
proposed_rois, proposed_roi_scores = self.oln_multilevel_propose_rois(
boxes,
scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
if is_training else self._test_rpn_pre_nms_top_k),
rpn_post_nms_top_k=(self._rpn_post_nms_top_k
if is_training else self._test_rpn_post_nms_top_k),
rpn_nms_threshold=(self._rpn_nms_threshold
if is_training else self._test_rpn_nms_threshold),
rpn_score_threshold=(self._rpn_score_threshold if is_training else
self._test_rpn_score_threshold),
rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
self._test_rpn_min_size_threshold),
decode_boxes=True,
clip_boxes=True,
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True,
is_box_lrtb=is_box_lrtb,
rpn_object_scores=object_scores,)
return proposed_rois, proposed_roi_scores
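# Illustrative only (not part of this commit): a hedged sketch of wiring up the
# OLN ROI generator, assuming OlnROIGenerator keeps ROIGenerator's constructor
# (which reads the `roi_proposal` ParamsDict) and that the rpn_* and anchor
# dicts are produced elsewhere in the model; those names are placeholders.
roi_generator = OlnROIGenerator(OLNMASK_CFG.roi_proposal)
rois, roi_scores = roi_generator(
    rpn_box_outputs,        # dict: level -> [batch, h, w, num_anchors * 4]
    rpn_score_outputs,      # dict: level -> [batch, h, w, num_anchors]
    anchor_boxes,           # dict: level -> [batch, h, w, num_anchors * 4]
    image_shape,            # [batch, 2], (height, width) of the scaled image
    is_training=True,
    is_box_lrtb=False,
    object_scores=rpn_center_outputs)  # centerness replaces the RPN scores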
def oln_multilevel_propose_rois(self,
rpn_boxes,
rpn_scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=2000,
rpn_post_nms_top_k=1000,
rpn_nms_threshold=0.7,
rpn_score_threshold=0.0,
rpn_min_size_threshold=0.0,
decode_boxes=True,
clip_boxes=True,
use_batched_nms=False,
apply_sigmoid_to_score=True,
is_box_lrtb=False,
rpn_object_scores=None,):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Adjust scores for each level if specified by rpn_object_scores.
b. Apply sigmoid transform if specified.
c. Decode boxes (either of xyhw or left-right-top-bottom format) if
specified.
d. Clip boxes if specified.
e. Filter small boxes and those that fall outside the image if specified.
f. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
g. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
rpn_boxes: a dict with keys representing FPN levels and values
representing box tensors of shape [batch_size, feature_h, feature_w,
num_anchors * 4].
rpn_scores: a dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
keep before applying NMS. Default: 2000.
rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
keep after applying NMS. Default: 1000.
rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
rpn_score_threshold: a float between 0 and 1 representing the minimal box
score to keep before applying NMS. This is often used as a pre-filtering
step for better performance. If 0, no filtering is applied. Default: 0.
rpn_min_size_threshold: a float representing the minimal box size in each
side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. If 0, no
filtering is applied. Default: 0.
decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
`anchor_boxes`. Default: True.
clip_boxes: a boolean indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default: True.
use_batched_nms: a boolean indicating whether NMS is applied in batch
using `tf.image.combined_non_max_suppression`. Currently only available
in CPU/GPU. Default: False.
apply_sigmoid_to_score: a boolean indicating whether to apply sigmoid to
`rpn_scores` before applying NMS. Default: True.
is_box_lrtb: a bool indicating whether boxes are in lrtb (left, right, top,
bottom) format.
rpn_object_scores: a predicted objectness score (e.g., centerness). In
OLN, we use object_scores=centerness as a replacement of the scores at
each level. A dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
Returns:
selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k,
1], representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(rpn_scores.keys()):
with tf.name_scope('level_%d' % level):
_, feature_h, feature_w, num_anchors_per_location = (
rpn_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if rpn_object_scores:
this_level_object_scores = rpn_object_scores[level]
this_level_object_scores = tf.reshape(this_level_object_scores,
[-1, num_boxes])
this_level_object_scores = tf.cast(this_level_object_scores,
this_level_scores.dtype)
this_level_scores = this_level_object_scores
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
if is_box_lrtb: # Box in left-right-top-bottom format.
this_level_boxes = box_utils.decode_boxes_lrtb(
this_level_boxes, this_level_anchors)
else: # Box in standard x-y-h-w format.
this_level_boxes = box_utils.decode_boxes(
this_level_boxes, this_level_anchors)
if clip_boxes:
this_level_boxes = box_utils.clip_boxes(
this_level_boxes, image_shape)
if rpn_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_utils.filter_boxes(
this_level_boxes, this_level_scores, image_shape,
rpn_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
if rpn_nms_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold,
score_threshold=rpn_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if rpn_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_utils.filter_boxes_by_scores(this_level_boxes,
this_level_scores,
rpn_score_threshold))
this_level_boxes, this_level_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores,
k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold))
else:
this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
selected_rois, selected_roi_scores = box_utils.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
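# Illustrative only (not part of this commit): the per-FPN-level dict layout
# the routine above consumes. With the OLN anchor config (num_scales=1,
# aspect_ratios=[1.0]) there is one anchor per location; the shapes below
# assume a 640x640 input with min_level=2 and max_level=6.
import tensorflow as tf

batch = 2
levels = range(2, 7)
rpn_scores = {l: tf.random.normal([batch, 640 // 2**l, 640 // 2**l, 1]) for l in levels}
rpn_boxes = {l: tf.random.normal([batch, 640 // 2**l, 640 // 2**l, 4]) for l in levels}
anchor_boxes = {l: tf.zeros([batch, 640 // 2**l, 640 // 2**l, 4]) for l in levels}
centerness = {l: tf.random.normal([batch, 640 // 2**l, 640 // 2**l, 1]) for l in levels}
# Passing rpn_object_scores=centerness makes step 1.a replace the RPN
# classification logits with the centerness logits before sigmoid, filtering
# and NMS.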
@@ -342,6 +342,180 @@ class ROISampler(object):
sampled_gt_indices)
class ROIScoreSampler(ROISampler):
"""Samples RoIs, RoI-scores and creates training targets."""
def __call__(self, rois, roi_scores, gt_boxes, gt_classes):
"""Sample and assign RoIs for training.
Args:
rois: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
roi_scores: a tensor of shape of [batch_size, N], representing the scores
of the proposed RoIs (e.g. RPN classification or centerness scores).
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_roi_scores: a tensor of shape of [batch_size, K], representing the
scores of the sampled RoIs.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
"""
(sampled_rois, sampled_roi_scores, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices) = (
self.assign_and_sample_proposals_and_scores(
rois,
roi_scores,
gt_boxes,
gt_classes,
num_samples_per_image=self._num_samples_per_image,
mix_gt_boxes=self._mix_gt_boxes,
fg_fraction=self._fg_fraction,
fg_iou_thresh=self._fg_iou_thresh,
bg_iou_thresh_hi=self._bg_iou_thresh_hi,
bg_iou_thresh_lo=self._bg_iou_thresh_lo))
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
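# Illustrative only (not part of this commit): a hedged sketch of sampling RoIs
# together with their objectness scores, assuming ROIScoreSampler keeps
# ROISampler's constructor (which reads the `roi_sampling` ParamsDict); the
# input tensors are placeholders produced earlier in the pipeline.
roi_sampler = ROIScoreSampler(OLNMASK_CFG.roi_sampling)
(rois, roi_scores, gt_box_targets, gt_class_targets, gt_indices) = roi_sampler(
    proposed_rois,          # [batch, N, 4]
    proposed_roi_scores,    # [batch, N]
    gt_boxes,               # [batch, MAX_NUM_INSTANCES, 4], padded with -1
    gt_classes)             # [batch, MAX_NUM_INSTANCES], padded with -1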
def assign_and_sample_proposals_and_scores(self,
proposed_boxes,
proposed_scores,
gt_boxes,
gt_classes,
num_samples_per_image=512,
mix_gt_boxes=True,
fg_fraction=0.25,
fg_iou_thresh=0.5,
bg_iou_thresh_hi=0.5,
bg_iou_thresh_lo=0.0):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
of proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
proposed_scores: a tensor of shape of [batch_size, N]. N is the number of
proposals before groundtruth assignment. It is the rpn scores for all
proposed boxes which can be either their classification or centerness
scores.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
num_samples_per_image: an integer representing the RoI minibatch size per
image.
mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes
before sampling proposals.
fg_fraction: a float representing the target fraction of the RoI minibatch
that is labeled foreground (i.e., class > 0).
fg_iou_thresh: a float representing the IoU overlap threshold for an RoI to
be considered foreground (if >= fg_iou_thresh).
bg_iou_thresh_hi: a float representing the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
bg_iou_thresh_lo: a float representing the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_scores: a tensor of shape of [batch_size, K], representing the
confidence score of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] =
sampled_gt_boxes[:, i].
"""
with tf.name_scope('sample_proposals_and_scores'):
if mix_gt_boxes:
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
gt_scores = tf.ones_like(gt_boxes[:, :, 0])
scores = tf.concat([proposed_scores, gt_scores], axis=1)
else:
boxes = proposed_boxes
scores = proposed_scores
(matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
_) = box_matching(boxes, gt_boxes, gt_classes)
positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
tf.greater_equal(matched_iou, bg_iou_thresh_lo),
tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)
# re-assign negatively matched boxes to the background class.
matched_gt_classes = tf.where(negative_match,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(negative_match,
tf.zeros_like(matched_gt_indices),
matched_gt_indices)
sample_candidates = tf.logical_and(
tf.logical_or(positive_match, negative_match),
tf.logical_not(ignored_match))
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=fg_fraction, is_static=True))
batch_size, _ = sample_candidates.get_shape().as_list()
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(sample_candidates[i],
num_samples_per_image,
positive_match[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, sampled_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=num_samples_per_image,
sorted=True)
sampled_indices_shape = tf.shape(sampled_indices)
batch_indices = (
tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
sampled_roi_scores = tf.gather_nd(scores, gather_nd_indices)
sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
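# Illustrative only (not part of this commit): with mix_gt_boxes=True the
# groundtruth boxes are appended to the proposals and given a score of 1.0, so
# the sampler can select them like any other RoI (mirrors the concat above).
import tensorflow as tf

proposed_boxes = tf.zeros([1, 3, 4])      # 3 proposals
proposed_scores = tf.fill([1, 3], 0.5)
gt_boxes = tf.ones([1, 2, 4])             # 2 groundtruth boxes
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)            # [1, 5, 4]
scores = tf.concat(
    [proposed_scores, tf.ones_like(gt_boxes[:, :, 0])], axis=1)  # [1, 5]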
class MaskSampler(object):
"""Samples and creates mask training targets."""
...