Commit 7785dec0 authored by Yeqing Li, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 425740068
parent 9c93f07c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI generator."""
from typing import Optional, Mapping
# Import libraries
import tensorflow as tf
from official.vision.ops import box_ops
from official.vision.ops import nms
def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
pre_nms_top_k: int = 2000,
pre_nms_score_threshold: float = 0.0,
pre_nms_min_size_threshold: float = 0.0,
nms_iou_threshold: float = 0.7,
num_proposals: int = 1000,
use_batched_nms: bool = False,
decode_boxes: bool = True,
clip_boxes: bool = True,
apply_sigmoid_to_score: bool = True):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter out small boxes and those that fall outside the image if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tensors of shape
[batch_size, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch_size, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension
is [height, width] of the scaled image.
pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep
before applying NMS. Default: 2000.
pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal
box score to keep before applying NMS. This is often used as a
pre-filtering step for better performance. Default: 0, no filtering is
applied.
pre_nms_min_size_threshold: A `float` representing the minimal box size in
each side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. Default: 0, no
filtering is applied.
nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
num_proposals: An `int` of top scoring RPN proposals *in total* to keep
after applying NMS. Default: 1000.
use_batched_nms: A `bool` indicating whether NMS is applied in batch using
`tf.image.combined_non_max_suppression`. Currently only available on
CPU/GPU. Default is False.
decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded
using `anchor_boxes`. If False, use `raw_boxes` directly and ignore
`anchor_boxes`. Default is True.
clip_boxes: A `bool` indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default is True.
apply_sigmoid_to_score: A `bool` indicating whether to apply sigmoid to
`raw_scores` before applying NMS. Default is True.
Returns:
selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals],
representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(raw_scores.keys()):
with tf.name_scope('level_%s' % level):
_, feature_h, feature_w, num_anchors_per_location = (
raw_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
this_level_boxes = box_ops.decode_boxes(
this_level_boxes, this_level_anchors)
if clip_boxes:
this_level_boxes = box_ops.clip_boxes(
this_level_boxes, image_shape)
if pre_nms_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_ops.filter_boxes(
this_level_boxes,
this_level_scores,
image_shape,
pre_nms_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, num_proposals)
if nms_iou_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=nms_iou_threshold,
score_threshold=pre_nms_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if pre_nms_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_ops.filter_boxes_by_scores(
this_level_boxes,
this_level_scores,
pre_nms_score_threshold))
this_level_boxes, this_level_scores = box_ops.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=nms_iou_threshold))
else:
this_level_rois, this_level_roi_scores = box_ops.top_k_boxes(
this_level_boxes,
this_level_scores,
k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, num_proposals)
selected_rois, selected_roi_scores = box_ops.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
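# A minimal usage sketch for `_multilevel_propose_rois` (illustrative only;
# the single-level toy shapes and values below are assumptions, not part of
# this module). One FPN level '2', a 2x2 feature map, one anchor per location:
#
#   rpn_scores = {'2': tf.random.uniform([1, 2, 2, 1])}
#   rpn_boxes = {'2': tf.random.uniform([1, 2, 2, 4])}
#   anchors = {'2': tf.random.uniform([1, 2 * 2 * 1, 4])}
#   image_shape = tf.constant([[128.0, 128.0]])
#   rois, roi_scores = _multilevel_propose_rois(
#       rpn_boxes, rpn_scores, anchors, image_shape,
#       pre_nms_top_k=4, num_proposals=4)
#   # rois: [1, 4, 4]; roi_scores: [1, 4]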
@tf.keras.utils.register_keras_serializable(package='Vision')
class MultilevelROIGenerator(tf.keras.layers.Layer):
"""Proposes RoIs for the second stage processing."""
def __init__(self,
pre_nms_top_k: int = 2000,
pre_nms_score_threshold: float = 0.0,
pre_nms_min_size_threshold: float = 0.0,
nms_iou_threshold: float = 0.7,
num_proposals: int = 1000,
test_pre_nms_top_k: int = 1000,
test_pre_nms_score_threshold: float = 0.0,
test_pre_nms_min_size_threshold: float = 0.0,
test_nms_iou_threshold: float = 0.7,
test_num_proposals: int = 1000,
use_batched_nms: bool = False,
**kwargs):
"""Initializes a ROI generator.
The ROI generator transforms the raw predictions from RPN to ROIs.
Args:
pre_nms_top_k: An `int` of the number of top-scoring proposals to be kept
before applying NMS.
pre_nms_score_threshold: A `float` of the score threshold to apply before
applying NMS. Proposals whose scores are below this threshold are
thrown away.
pre_nms_min_size_threshold: A `float` of the threshold of each side of the
box (w.r.t. the scaled image). Proposals whose sides are below this
threshold are thrown away.
nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold.
num_proposals: An `int` of the final number of proposals to generate.
test_pre_nms_top_k: An `int` of the number of top-scoring proposals to be
kept before applying NMS in testing.
test_pre_nms_score_threshold: A `float` of the score threshold to apply
before applying NMS in testing. Proposals whose scores are below this
threshold are thrown away.
test_pre_nms_min_size_threshold: A `float` of the threshold of each side
of the box (w.r.t. the scaled image) in testing. Proposals whose sides
are below this threshold are thrown away.
test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in
testing.
test_num_proposals: An `int` of the final number of proposals to generate
in testing.
use_batched_nms: A `bool` of whether or not to use
`tf.image.combined_non_max_suppression`.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'pre_nms_top_k': pre_nms_top_k,
'pre_nms_score_threshold': pre_nms_score_threshold,
'pre_nms_min_size_threshold': pre_nms_min_size_threshold,
'nms_iou_threshold': nms_iou_threshold,
'num_proposals': num_proposals,
'test_pre_nms_top_k': test_pre_nms_top_k,
'test_pre_nms_score_threshold': test_pre_nms_score_threshold,
'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold,
'test_nms_iou_threshold': test_nms_iou_threshold,
'test_num_proposals': test_num_proposals,
'use_batched_nms': use_batched_nms,
}
super(MultilevelROIGenerator, self).__init__(**kwargs)
def call(self,
raw_boxes: Mapping[str, tf.Tensor],
raw_scores: Mapping[str, tf.Tensor],
anchor_boxes: Mapping[str, tf.Tensor],
image_shape: tf.Tensor,
training: Optional[bool] = None):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter out small boxes and those that fall outside the image if
specified.
e. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
raw_boxes: A `dict` with keys representing FPN levels and values
representing box tensors of shape
[batch, feature_h, feature_w, num_anchors * 4].
raw_scores: A `dict` with keys representing FPN levels and values
representing logit tensors of shape
[batch, feature_h, feature_w, num_anchors].
anchor_boxes: A `dict` with keys representing FPN levels and values
representing anchor box tensors of shape
[batch, feature_h * feature_w * num_anchors, 4].
image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension
is [height, width] of the scaled image.
training: A `bool` that indicates whether it is in training mode.
Returns:
roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed
ROIs in the scaled image coordinate.
roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the
proposed ROIs.
"""
roi_boxes, roi_scores = _multilevel_propose_rois(
raw_boxes,
raw_scores,
anchor_boxes,
image_shape,
pre_nms_top_k=(
self._config_dict['pre_nms_top_k'] if training
else self._config_dict['test_pre_nms_top_k']),
pre_nms_score_threshold=(
self._config_dict['pre_nms_score_threshold'] if training
else self._config_dict['test_pre_nms_score_threshold']),
pre_nms_min_size_threshold=(
self._config_dict['pre_nms_min_size_threshold'] if training
else self._config_dict['test_pre_nms_min_size_threshold']),
nms_iou_threshold=(
self._config_dict['nms_iou_threshold'] if training
else self._config_dict['test_nms_iou_threshold']),
num_proposals=(
self._config_dict['num_proposals'] if training
else self._config_dict['test_num_proposals']),
use_batched_nms=self._config_dict['use_batched_nms'],
decode_boxes=True,
clip_boxes=True,
apply_sigmoid_to_score=True)
return roi_boxes, roi_scores
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
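# Example layer wiring (a sketch; the placeholder inputs and argument values
# are illustrative, not prescribed by this module):
#
#   generator = MultilevelROIGenerator(pre_nms_top_k=2000, num_proposals=1000)
#   rois, roi_scores = generator(
#       raw_boxes, raw_scores, anchor_boxes, image_shape, training=True)
#   # Training uses `pre_nms_top_k`/`num_proposals` etc.; evaluation switches
#   # to the `test_*` counterparts.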
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_generator.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.modeling.layers import roi_generator
class MultilevelProposeRoisTest(tf.test.TestCase):
def test_multilevel_propose_rois_single_level(self):
rpn_boxes_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.9, 9.9]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 4, 4], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
rpn_boxes = {
'2': tf.constant(rpn_boxes_np, dtype=tf.float32)
}
rpn_scores_np = np.array(
[[[[0.6], [0.9]], [[0.2], [0.3]]], [[[0.1], [0.8]], [[0.3], [0.5]]]])
rpn_scores = {
'2': tf.constant(rpn_scores_np, dtype=tf.float32)
}
anchor_boxes_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.9, 9.9]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 4, 4], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
anchor_boxes = {
'2': tf.constant(anchor_boxes_np, dtype=tf.float32)
}
image_shape = tf.constant([[20, 20], [20, 20]], dtype=tf.int32)
selected_rois_np = np.array(
[[[0.01, 0.01, 9.9, 9.9], [2, 2, 8, 8], [5, 5, 10, 10], [0, 0, 0, 0]],
[[3, 3, 6, 6], [1, 1, 8, 8], [2, 2, 4, 4], [0, 0, 0, 0]]])
selected_roi_scores_np = np.array(
[[0.9, 0.3, 0.2, 0], [0.8, 0.5, 0.1, 0]])
# Runs on TPU.
strategy = tf.distribute.TPUStrategy()
with strategy.scope():
selected_rois_tpu, selected_roi_scores_tpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=4,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
# Runs on CPU.
selected_rois_cpu, selected_roi_scores_cpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=4,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_np, 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_np, 1e-5)
def test_multilevel_propose_rois_two_levels(self):
rpn_boxes_1_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.99, 9.99]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 2.5, 2.5], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
rpn_boxes_2_np = np.array(
[[[[0, 0, 10.01, 10.01]]], [[[2, 2, 4.5, 4.5]]]])
rpn_boxes = {
'2': tf.constant(rpn_boxes_1_np, dtype=tf.float32),
'3': tf.constant(rpn_boxes_2_np, dtype=tf.float32),
}
rpn_scores_1_np = np.array(
[[[[0.6], [0.9]], [[0.2], [0.3]]], [[[0.1], [0.8]], [[0.3], [0.5]]]])
rpn_scores_2_np = np.array([[[[0.95]]], [[[0.99]]]])
rpn_scores = {
'2': tf.constant(rpn_scores_1_np, dtype=tf.float32),
'3': tf.constant(rpn_scores_2_np, dtype=tf.float32),
}
anchor_boxes_1_np = np.array(
[[[[0, 0, 10, 10], [0.01, 0.01, 9.99, 9.99]],
[[5, 5, 10, 10], [2, 2, 8, 8]]],
[[[2, 2, 2.5, 2.5], [3, 3, 6, 6]],
[[3.1, 3.1, 6.1, 6.1], [1, 1, 8, 8]]]])
anchor_boxes_2_np = np.array(
[[[[0, 0, 10.01, 10.01]]], [[[2, 2, 4.5, 4.5]]]])
anchor_boxes = {
'2': tf.constant(anchor_boxes_1_np, dtype=tf.float32),
'3': tf.constant(anchor_boxes_2_np, dtype=tf.float32),
}
image_shape = tf.constant([[20, 20], [20, 20]], dtype=tf.int32)
selected_rois_np = np.array(
[[[0, 0, 10.01, 10.01], [0.01, 0.01, 9.99, 9.99]],
[[2, 2, 4.5, 4.5], [3, 3, 6, 6]]])
selected_roi_scores_np = np.array([[0.95, 0.9], [0.99, 0.8]])
# Runs on TPU.
strategy = tf.distribute.TPUStrategy()
with strategy.scope():
selected_rois_tpu, selected_roi_scores_tpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=2,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
# Runs on CPU.
selected_rois_cpu, selected_roi_scores_cpu = (
roi_generator._multilevel_propose_rois(
rpn_boxes,
rpn_scores,
anchor_boxes=anchor_boxes,
image_shape=image_shape,
pre_nms_top_k=4,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.5,
num_proposals=2,
use_batched_nms=False,
decode_boxes=False,
clip_boxes=False,
apply_sigmoid_to_score=False))
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_cpu.numpy(), 1e-5)
self.assertNDArrayNear(
selected_rois_tpu.numpy(), selected_rois_np, 1e-5)
self.assertNDArrayNear(
selected_roi_scores_tpu.numpy(), selected_roi_scores_np, 1e-5)
class MultilevelROIGeneratorTest(tf.test.TestCase):
def test_serialize_deserialize(self):
kwargs = dict(
pre_nms_top_k=2000,
pre_nms_score_threshold=0.0,
pre_nms_min_size_threshold=0.0,
nms_iou_threshold=0.7,
num_proposals=1000,
test_pre_nms_top_k=1000,
test_pre_nms_score_threshold=0.0,
test_pre_nms_min_size_threshold=0.0,
test_nms_iou_threshold=0.7,
test_num_proposals=1000,
use_batched_nms=False,
)
generator = roi_generator.MultilevelROIGenerator(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(generator.get_config(), expected_config)
new_generator = roi_generator.MultilevelROIGenerator.from_config(
generator.get_config())
self.assertAllEqual(generator.get_config(), new_generator.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains definitions of ROI sampler."""
# Import libraries
import tensorflow as tf
from official.vision.modeling.layers import box_sampler
from official.vision.ops import box_matcher
from official.vision.ops import iou_similarity
from official.vision.ops import target_gather
@tf.keras.utils.register_keras_serializable(package='Vision')
class ROISampler(tf.keras.layers.Layer):
"""Samples ROIs and assigns targets to the sampled ROIs."""
def __init__(self,
mix_gt_boxes: bool = True,
num_sampled_rois: int = 512,
foreground_fraction: float = 0.25,
foreground_iou_threshold: float = 0.5,
background_iou_high_threshold: float = 0.5,
background_iou_low_threshold: float = 0,
skip_subsampling: bool = False,
**kwargs):
"""Initializes a ROI sampler.
Args:
mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with
proposed ROIs.
num_sampled_rois: An `int` of the number of sampled ROIs per image.
foreground_fraction: A `float` in [0, 1], the fraction of sampled ROIs
that should be drawn from the foreground boxes.
foreground_iou_threshold: A `float` that represents the IoU threshold for
a box to be considered as positive (if >= `foreground_iou_threshold`).
background_iou_high_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`]).
background_iou_low_threshold: A `float` that represents the IoU threshold
for a box to be considered as negative (if overlap in
[`background_iou_low_threshold`, `background_iou_high_threshold`])
skip_subsampling: A `bool` that determines whether to skip the sampling
procedure that balances the fg/bg classes. Used for upper frcnn layers
in cascade RCNN.
**kwargs: Additional keyword arguments passed to Layer.
"""
self._config_dict = {
'mix_gt_boxes': mix_gt_boxes,
'num_sampled_rois': num_sampled_rois,
'foreground_fraction': foreground_fraction,
'foreground_iou_threshold': foreground_iou_threshold,
'background_iou_high_threshold': background_iou_high_threshold,
'background_iou_low_threshold': background_iou_low_threshold,
'skip_subsampling': skip_subsampling,
}
self._sim_calc = iou_similarity.IouSimilarity()
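# The three thresholds split IoU into four bins; the indicators tag each
# bin with the match category consumed in `call` below:
#   -3: invalid (IoU below `background_iou_low_threshold`)
#   -1: negative/background (IoU within the background range)
#   -2: ignored (IoU between the background high and foreground thresholds)
#    1: positive/foreground (IoU >= `foreground_iou_threshold`)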
self._box_matcher = box_matcher.BoxMatcher(
thresholds=[
background_iou_low_threshold, background_iou_high_threshold,
foreground_iou_threshold
],
indicators=[-3, -1, -2, 1])
self._target_gather = target_gather.TargetGather()
self._sampler = box_sampler.BoxSampler(
num_sampled_rois, foreground_fraction)
super(ROISampler, self).__init__(**kwargs)
def call(self, boxes: tf.Tensor, gt_boxes: tf.Tensor, gt_classes: tf.Tensor):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each groundtruth box.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
boxes: A `tf.Tensor` of shape [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
gt_boxes: A `tf.Tensor` of shape [batch_size, MAX_NUM_INSTANCES, 4].
The coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have paddings with values of -1 indicating the
invalid box coordinates.
gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES].
This tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: A `tf.Tensor` of shape [batch_size, K, 4], representing
the coordinates of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: A `tf.Tensor` of shape [batch_size, K, 4], storing
the box coordinates of the matched groundtruth boxes of the sampled
RoIs.
sampled_gt_classes: A `tf.Tensor` of shape [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: A `tf.Tensor` of shape [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e.,
gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype)
if self._config_dict['mix_gt_boxes']:
boxes = tf.concat([boxes, gt_boxes], axis=1)
boxes_invalid_mask = tf.less(
tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0)
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
similarity_matrix = self._sim_calc(boxes, gt_boxes, boxes_invalid_mask,
gt_invalid_mask)
matched_gt_indices, match_indicators = self._box_matcher(similarity_matrix)
positive_matches = tf.greater_equal(match_indicators, 0)
negative_matches = tf.equal(match_indicators, -1)
ignored_matches = tf.equal(match_indicators, -2)
invalid_matches = tf.equal(match_indicators, -3)
background_mask = tf.expand_dims(
tf.logical_or(negative_matches, invalid_matches), -1)
gt_classes = tf.expand_dims(gt_classes, axis=-1)
matched_gt_classes = self._target_gather(gt_classes, matched_gt_indices,
background_mask)
matched_gt_classes = tf.where(background_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_boxes = self._target_gather(gt_boxes, matched_gt_indices,
tf.tile(background_mask, [1, 1, 4]))
matched_gt_boxes = tf.where(background_mask,
tf.zeros_like(matched_gt_boxes),
matched_gt_boxes)
matched_gt_indices = tf.where(
tf.squeeze(background_mask, -1), -tf.ones_like(matched_gt_indices),
matched_gt_indices)
if self._config_dict['skip_subsampling']:
return (boxes, matched_gt_boxes, tf.squeeze(matched_gt_classes,
axis=-1), matched_gt_indices)
sampled_indices = self._sampler(
positive_matches, negative_matches, ignored_matches)
sampled_rois = self._target_gather(boxes, sampled_indices)
sampled_gt_boxes = self._target_gather(matched_gt_boxes, sampled_indices)
sampled_gt_classes = tf.squeeze(self._target_gather(
matched_gt_classes, sampled_indices), axis=-1)
sampled_gt_indices = tf.squeeze(self._target_gather(
tf.expand_dims(matched_gt_indices, -1), sampled_indices), axis=-1)
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
def get_config(self):
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
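# A minimal usage sketch (the toy input names and shapes are illustrative
# assumptions, not part of this module):
#
#   sampler = ROISampler(num_sampled_rois=512, foreground_fraction=0.25)
#   rois, gt_box_targets, gt_class_targets, gt_indices = sampler(
#       proposed_boxes,  # [batch, N, 4] in [ymin, xmin, ymax, xmax] pixels
#       gt_boxes,        # [batch, MAX_NUM_INSTANCES, 4], padded with -1
#       gt_classes)      # [batch, MAX_NUM_INSTANCES], padded with -1
#   # Each returned tensor carries `num_sampled_rois` entries along axis 1.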
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for roi_sampler.py."""
# Import libraries
import numpy as np
import tensorflow as tf
from official.vision.modeling.layers import roi_sampler
class ROISamplerTest(tf.test.TestCase):
def test_roi_sampler(self):
boxes_np = np.array(
[[[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5],
[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]])
boxes = tf.constant(boxes_np, dtype=tf.float32)
gt_boxes_np = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5],
[-1, -1, -1, -1]]])
gt_boxes = tf.constant(gt_boxes_np, dtype=tf.float32)
gt_classes_np = np.array([[2, 10, -1]])
gt_classes = tf.constant(gt_classes_np, dtype=tf.int32)
generator = roi_sampler.ROISampler(
mix_gt_boxes=True,
num_sampled_rois=2,
foreground_fraction=0.5,
foreground_iou_threshold=0.5,
background_iou_high_threshold=0.5,
background_iou_low_threshold=0.0)
# Runs on TPU.
strategy = tf.distribute.TPUStrategy()
with strategy.scope():
_ = generator(boxes, gt_boxes, gt_classes)
# Runs on CPU.
_ = generator(boxes, gt_boxes, gt_classes)
def test_serialize_deserialize(self):
kwargs = dict(
mix_gt_boxes=True,
num_sampled_rois=512,
foreground_fraction=0.25,
foreground_iou_threshold=0.5,
background_iou_high_threshold=0.5,
background_iou_low_threshold=0.5,
skip_subsampling=False,
)
generator = roi_sampler.ROISampler(**kwargs)
expected_config = dict(kwargs)
self.assertEqual(generator.get_config(), expected_config)
new_generator = roi_sampler.ROISampler.from_config(
generator.get_config())
self.assertAllEqual(generator.get_config(), new_generator.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""R-CNN(-RS) models."""
from typing import Any, List, Mapping, Optional, Tuple, Union
import tensorflow as tf
from official.vision.ops import anchor
from official.vision.ops import box_ops
@tf.keras.utils.register_keras_serializable(package='Vision')
class MaskRCNNModel(tf.keras.Model):
"""The Mask R-CNN(-RS) and Cascade RCNN-RS models."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
rpn_head: tf.keras.layers.Layer,
detection_head: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_generator: tf.keras.layers.Layer,
roi_sampler: Union[tf.keras.layers.Layer,
List[tf.keras.layers.Layer]],
roi_aligner: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
mask_head: Optional[tf.keras.layers.Layer] = None,
mask_sampler: Optional[tf.keras.layers.Layer] = None,
mask_roi_aligner: Optional[tf.keras.layers.Layer] = None,
class_agnostic_bbox_pred: bool = False,
cascade_class_ensemble: bool = False,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Initializes the R-CNN(-RS) model.
Args:
backbone: `tf.keras.Model`, the backbone network.
decoder: `tf.keras.Model`, the decoder network.
rpn_head: the RPN head.
detection_head: the detection head or a list of heads.
roi_generator: the ROI generator.
roi_sampler: a single ROI sampler or a list of ROI samplers for cascade
detection heads.
roi_aligner: the ROI aligner.
detection_generator: the detection generator.
mask_head: the mask head.
mask_sampler: the mask sampler.
mask_roi_aligner: the ROI aligner for mask prediction.
class_agnostic_bbox_pred: if True, perform class agnostic bounding box
prediction. Needs to be `True` for Cascade RCNN models.
cascade_class_ensemble: if True, ensemble classification scores over all
detection heads.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added on each level.
For instance, num_scales=2 adds one additional intermediate anchor scale,
yielding scales [2^0, 2^0.5] on each level.
aspect_ratios: A list representing the aspect ratio anchors added on each
level. The number indicates the ratio of width to height. For instance,
aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level.
anchor_size: A number representing the scale of the base anchor size
relative to the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
super(MaskRCNNModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'rpn_head': rpn_head,
'detection_head': detection_head,
'roi_generator': roi_generator,
'roi_sampler': roi_sampler,
'roi_aligner': roi_aligner,
'detection_generator': detection_generator,
'mask_head': mask_head,
'mask_sampler': mask_sampler,
'mask_roi_aligner': mask_roi_aligner,
'class_agnostic_bbox_pred': class_agnostic_bbox_pred,
'cascade_class_ensemble': cascade_class_ensemble,
'min_level': min_level,
'max_level': max_level,
'num_scales': num_scales,
'aspect_ratios': aspect_ratios,
'anchor_size': anchor_size,
}
self.backbone = backbone
self.decoder = decoder
self.rpn_head = rpn_head
if not isinstance(detection_head, (list, tuple)):
self.detection_head = [detection_head]
else:
self.detection_head = detection_head
self.roi_generator = roi_generator
if not isinstance(roi_sampler, (list, tuple)):
self.roi_sampler = [roi_sampler]
else:
self.roi_sampler = roi_sampler
if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred:
raise ValueError(
'`class_agnostic_bbox_pred` needs to be True if multiple detection heads are specified.'
)
self.roi_aligner = roi_aligner
self.detection_generator = detection_generator
self._include_mask = mask_head is not None
self.mask_head = mask_head
if self._include_mask and mask_sampler is None:
raise ValueError('`mask_sampler` is not provided in Mask R-CNN.')
self.mask_sampler = mask_sampler
if self._include_mask and mask_roi_aligner is None:
raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.')
self.mask_roi_aligner = mask_roi_aligner
# Weights for the regression losses for each FRCNN layer.
# TODO(xianzhi): Make the weights configurable.
self._cascade_layer_to_weights = [
[10.0, 10.0, 5.0, 5.0],
[20.0, 20.0, 10.0, 10.0],
[30.0, 30.0, 15.0, 15.0],
]
def call(self,
images: tf.Tensor,
image_shape: tf.Tensor,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
gt_boxes: Optional[tf.Tensor] = None,
gt_classes: Optional[tf.Tensor] = None,
gt_masks: Optional[tf.Tensor] = None,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
model_outputs, intermediate_outputs = self._call_box_outputs(
images=images, image_shape=image_shape, anchor_boxes=anchor_boxes,
gt_boxes=gt_boxes, gt_classes=gt_classes, training=training)
if not self._include_mask:
return model_outputs
model_mask_outputs = self._call_mask_outputs(
model_box_outputs=model_outputs,
features=model_outputs['decoder_features'],
current_rois=intermediate_outputs['current_rois'],
matched_gt_indices=intermediate_outputs['matched_gt_indices'],
matched_gt_boxes=intermediate_outputs['matched_gt_boxes'],
matched_gt_classes=intermediate_outputs['matched_gt_classes'],
gt_masks=gt_masks,
training=training)
model_outputs.update(model_mask_outputs)
return model_outputs
def _get_backbone_and_decoder_features(self, images):
backbone_features = self.backbone(images)
if self.decoder:
features = self.decoder(backbone_features)
else:
features = backbone_features
return backbone_features, features
def _call_box_outputs(
self, images: tf.Tensor,
image_shape: tf.Tensor,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
gt_boxes: Optional[tf.Tensor] = None,
gt_classes: Optional[tf.Tensor] = None,
training: Optional[bool] = None) -> Tuple[
Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]:
"""Implementation of the Faster-RCNN logic for boxes."""
model_outputs = {}
# Feature extraction.
(backbone_features,
decoder_features) = self._get_backbone_and_decoder_features(images)
# Region proposal network.
rpn_scores, rpn_boxes = self.rpn_head(decoder_features)
model_outputs.update({
'backbone_features': backbone_features,
'decoder_features': decoder_features,
'rpn_boxes': rpn_boxes,
'rpn_scores': rpn_scores
})
# Generate anchor boxes for this batch if not provided.
if anchor_boxes is None:
_, image_height, image_width, _ = images.get_shape().as_list()
anchor_boxes = anchor.Anchor(
min_level=self._config_dict['min_level'],
max_level=self._config_dict['max_level'],
num_scales=self._config_dict['num_scales'],
aspect_ratios=self._config_dict['aspect_ratios'],
anchor_size=self._config_dict['anchor_size'],
image_size=(image_height, image_width)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0),
[tf.shape(images)[0], 1, 1, 1])
# Generate RoIs.
current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes,
image_shape, training)
next_rois = current_rois
all_class_outputs = []
for cascade_num in range(len(self.roi_sampler)):
# In cascade RCNN we want the higher layers to have different regression
# weights as the predicted deltas become smaller and smaller.
regression_weights = self._cascade_layer_to_weights[cascade_num]
current_rois = next_rois
(class_outputs, box_outputs, model_outputs, matched_gt_boxes,
matched_gt_classes, matched_gt_indices,
current_rois) = self._run_frcnn_head(
features=decoder_features,
rois=current_rois,
gt_boxes=gt_boxes,
gt_classes=gt_classes,
training=training,
model_outputs=model_outputs,
cascade_num=cascade_num,
regression_weights=regression_weights)
all_class_outputs.append(class_outputs)
# Generate ROIs for the next cascade head if there is any.
if cascade_num < len(self.roi_sampler) - 1:
next_rois = box_ops.decode_boxes(
tf.cast(box_outputs, tf.float32),
current_rois,
weights=regression_weights)
next_rois = box_ops.clip_boxes(next_rois,
tf.expand_dims(image_shape, axis=1))
if not training:
if self._config_dict['cascade_class_ensemble']:
class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs)
detections = self.detection_generator(
box_outputs,
class_outputs,
current_rois,
image_shape,
regression_weights,
bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred']))
model_outputs.update({
'cls_outputs': class_outputs,
'box_outputs': box_outputs,
})
if self.detection_generator.get_config()['apply_nms']:
model_outputs.update({
'detection_boxes': detections['detection_boxes'],
'detection_scores': detections['detection_scores'],
'detection_classes': detections['detection_classes'],
'num_detections': detections['num_detections']
})
else:
model_outputs.update({
'decoded_boxes': detections['decoded_boxes'],
'decoded_box_scores': detections['decoded_box_scores']
})
intermediate_outputs = {
'matched_gt_boxes': matched_gt_boxes,
'matched_gt_indices': matched_gt_indices,
'matched_gt_classes': matched_gt_classes,
'current_rois': current_rois,
}
return (model_outputs, intermediate_outputs)
def _call_mask_outputs(
self,
model_box_outputs: Mapping[str, tf.Tensor],
features: tf.Tensor,
current_rois: tf.Tensor,
matched_gt_indices: tf.Tensor,
matched_gt_boxes: tf.Tensor,
matched_gt_classes: tf.Tensor,
gt_masks: tf.Tensor,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
"""Implementation of Mask-RCNN mask prediction logic."""
model_outputs = dict(model_box_outputs)
if training:
current_rois, roi_classes, roi_masks = self.mask_sampler(
current_rois, matched_gt_boxes, matched_gt_classes,
matched_gt_indices, gt_masks)
roi_masks = tf.stop_gradient(roi_masks)
model_outputs.update({
'mask_class_targets': roi_classes,
'mask_targets': roi_masks,
})
else:
current_rois = model_outputs['detection_boxes']
roi_classes = model_outputs['detection_classes']
mask_logits, mask_probs = self._features_to_mask_outputs(
features, current_rois, roi_classes)
if training:
model_outputs.update({
'mask_outputs': mask_logits,
})
else:
model_outputs.update({
'detection_masks': mask_probs,
})
return model_outputs
def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training,
model_outputs, cascade_num, regression_weights):
"""Runs the frcnn head that does both class and box prediction.
Args:
features: `list` of features from the feature extractor.
rois: `list` of current rois from which bbox refinements and classes will
be predicted.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4].
This tensor might have paddings with a negative value.
gt_classes: [batch_size, MAX_NUM_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
training: `bool`, if model is training or being evaluated.
model_outputs: `dict`, used for storing outputs used for eval and losses.
cascade_num: `int`, the current frcnn layer in the cascade.
regression_weights: `list`, weights used for l1 loss in bounding box
regression.
Returns:
class_outputs: Class predictions for rois.
box_outputs: Box predictions for rois. These are formatted for the
regression loss and need to be converted before being used as rois
in the next stage.
model_outputs: Updated dict with predictions used for losses and eval.
matched_gt_boxes: If `training` is true, the gt box location of each
positive match.
matched_gt_classes: If `training` is true, the gt class of each
positive match.
matched_gt_indices: If `training` is true, the index of each positive
box match. Used for mask prediction.
rois: The sampled rois used for this layer.
"""
# Only used during training.
matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None,
None)
if training and gt_boxes is not None:
rois = tf.stop_gradient(rois)
current_roi_sampler = self.roi_sampler[cascade_num]
rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
current_roi_sampler(rois, gt_boxes, gt_classes))
# Create bounding box training targets.
box_targets = box_ops.encode_boxes(
matched_gt_boxes, rois, weights=regression_weights)
# If the target is background, the box target is set to all 0s.
box_targets = tf.where(
tf.tile(
tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
[1, 1, 4]), tf.zeros_like(box_targets), box_targets)
model_outputs.update({
'class_targets_{}'.format(cascade_num)
if cascade_num else 'class_targets':
matched_gt_classes,
'box_targets_{}'.format(cascade_num)
if cascade_num else 'box_targets':
box_targets,
})
# Get roi features.
roi_features = self.roi_aligner(features, rois)
# Run frcnn head to get class and bbox predictions.
current_detection_head = self.detection_head[cascade_num]
class_outputs, box_outputs = current_detection_head(roi_features)
model_outputs.update({
'class_outputs_{}'.format(cascade_num)
if cascade_num else 'class_outputs':
class_outputs,
'box_outputs_{}'.format(cascade_num) if cascade_num else 'box_outputs':
box_outputs,
})
return (class_outputs, box_outputs, model_outputs, matched_gt_boxes,
matched_gt_classes, matched_gt_indices, rois)
def _features_to_mask_outputs(self, features, rois, roi_classes):
# Mask RoI align.
mask_roi_features = self.mask_roi_aligner(features, rois)
# Mask head.
raw_masks = self.mask_head([mask_roi_features, roi_classes])
return raw_masks, tf.nn.sigmoid(raw_masks)
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(
backbone=self.backbone,
rpn_head=self.rpn_head,
detection_head=self.detection_head)
if self.decoder is not None:
items.update(decoder=self.decoder)
if self._include_mask:
items.update(mask_head=self.mask_head)
return items
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
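# Construction sketch (mirrors the tests below; the component objects and
# hyperparameter values are illustrative, see MaskRCNNModelTest for a
# complete, runnable wiring):
#
#   model = MaskRCNNModel(
#       backbone, decoder, rpn_head, detection_head,
#       roi_generator_obj, roi_sampler_obj, roi_aligner_obj,
#       detection_generator_obj, mask_head, mask_sampler_obj,
#       mask_roi_aligner_obj,
#       min_level=3, max_level=7, num_scales=3,
#       aspect_ratios=[1.0], anchor_size=3)
#   outputs = model(images, image_shape, training=False)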
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for maskrcnn_model.py."""
import os
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import maskrcnn_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.heads import instance_heads
from official.vision.modeling.layers import detection_generator
from official.vision.modeling.layers import mask_sampler
from official.vision.modeling.layers import roi_aligner
from official.vision.modeling.layers import roi_generator
from official.vision.modeling.layers import roi_sampler
from official.vision.ops import anchor
class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase):
@combinations.generate(
combinations.combine(
include_mask=[True, False],
use_separable_conv=[True, False],
build_anchor_boxes=[True, False],
is_training=[True, False]))
def test_build_model(self, include_mask, use_separable_conv,
build_anchor_boxes, is_training):
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
resnet_model_id = 50
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
else:
anchor_boxes = None
backbone = resnet.ResNet(model_id=resnet_model_id)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
use_separable_conv=use_separable_conv)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location,
num_convs=1)
detection_head = instance_heads.DetectionHead(num_classes=num_classes)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
# Results will be checked in test_forward.
_ = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=is_training)
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
include_mask=[True, False],
build_anchor_boxes=[True, False],
use_cascade_heads=[True, False],
training=[True, False],
))
def test_forward(self, strategy, include_mask, build_anchor_boxes, training,
use_cascade_heads):
num_classes = 3
min_level = 3
max_level = 4
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
if use_cascade_heads:
cascade_iou_thresholds = [0.6]
class_agnostic_bbox_pred = True
cascade_class_ensemble = True
else:
cascade_iou_thresholds = None
class_agnostic_bbox_pred = False
cascade_class_ensemble = False
image_size = (256, 256)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array([[224, 100], [100, 224]])
with strategy.scope():
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
image_size=image_size).multilevel_boxes
else:
anchor_boxes = None
num_anchors_per_location = len(aspect_ratios) * num_scales
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=min_level,
max_level=max_level,
input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=min_level,
max_level=max_level,
num_anchors_per_location=num_anchors_per_location)
detection_head = instance_heads.DetectionHead(
num_classes=num_classes,
class_agnostic_bbox_pred=class_agnostic_bbox_pred)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_cascade = []
roi_sampler_obj = roi_sampler.ROISampler()
roi_sampler_cascade.append(roi_sampler_obj)
if cascade_iou_thresholds:
for iou in cascade_iou_thresholds:
roi_sampler_obj = roi_sampler.ROISampler(
mix_gt_boxes=False,
foreground_iou_threshold=iou,
background_iou_high_threshold=iou,
background_iou_low_threshold=0.0,
skip_subsampling=True)
roi_sampler_cascade.append(roi_sampler_obj)
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(
num_classes=num_classes, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
class_agnostic_bbox_pred=class_agnostic_bbox_pred,
cascade_class_ensemble=cascade_class_ensemble,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
gt_boxes = np.array(
[[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]],
[[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]],
dtype=np.float32)
gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32)
if include_mask:
gt_masks = np.ones((2, 3, 100, 100))
else:
gt_masks = None
results = model(
images,
image_shape,
anchor_boxes,
gt_boxes,
gt_classes,
gt_masks,
training=training)
self.assertIn('rpn_boxes', results)
self.assertIn('rpn_scores', results)
if training:
self.assertIn('class_targets', results)
self.assertIn('box_targets', results)
self.assertIn('class_outputs', results)
self.assertIn('box_outputs', results)
if include_mask:
self.assertIn('mask_outputs', results)
else:
self.assertIn('detection_boxes', results)
self.assertIn('detection_scores', results)
self.assertIn('detection_classes', results)
self.assertIn('num_detections', results)
if include_mask:
self.assertIn('detection_masks', results)
@parameterized.parameters(
(False,),
(True,),
)
def test_serialize_deserialize(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
config = model.get_config()
new_model = maskrcnn_model.MaskRCNNModel.from_config(config)
# Validate that the config can be serialized to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
@parameterized.parameters(
(False,),
(True,),
)
def test_checkpoint(self, include_mask):
input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3])
backbone = resnet.ResNet(model_id=50, input_specs=input_specs)
decoder = fpn.FPN(
min_level=3, max_level=7, input_specs=backbone.output_specs)
rpn_head = dense_prediction_heads.RPNHead(
min_level=3, max_level=7, num_anchors_per_location=3)
detection_head = instance_heads.DetectionHead(num_classes=2)
roi_generator_obj = roi_generator.MultilevelROIGenerator()
roi_sampler_obj = roi_sampler.ROISampler()
roi_aligner_obj = roi_aligner.MultilevelROIAligner()
detection_generator_obj = detection_generator.DetectionGenerator()
if include_mask:
mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2)
mask_sampler_obj = mask_sampler.MaskSampler(
mask_target_size=28, num_sampled_masks=1)
mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14)
else:
mask_head = None
mask_sampler_obj = None
mask_roi_aligner_obj = None
model = maskrcnn_model.MaskRCNNModel(
backbone,
decoder,
rpn_head,
detection_head,
roi_generator_obj,
roi_sampler_obj,
roi_aligner_obj,
detection_generator_obj,
mask_head,
mask_sampler_obj,
mask_roi_aligner_obj,
min_level=3,
max_level=7,
num_scales=3,
aspect_ratios=[1.0],
anchor_size=3)
expect_checkpoint_items = dict(
backbone=backbone,
decoder=decoder,
rpn_head=rpn_head,
detection_head=[detection_head])
if include_mask:
expect_checkpoint_items['mask_head'] = mask_head
self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items)
# Test save and load checkpoints.
ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items)
save_dir = self.create_tempdir().full_path
ckpt.save(os.path.join(save_dir, 'ckpt'))
partial_ckpt = tf.train.Checkpoint(backbone=backbone)
partial_ckpt.read(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if include_mask:
partial_ckpt_mask = tf.train.Checkpoint(
backbone=backbone, mask_head=mask_head)
partial_ckpt_mask.restore(tf.train.latest_checkpoint(
save_dir)).expect_partial().assert_existing_objects_matched()
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RetinaNet."""
from typing import Any, Mapping, List, Optional, Union
# Import libraries
import tensorflow as tf
from official.vision.ops import anchor
@tf.keras.utils.register_keras_serializable(package='Vision')
class RetinaNetModel(tf.keras.Model):
"""The RetinaNet model class."""
def __init__(self,
backbone: tf.keras.Model,
decoder: tf.keras.Model,
head: tf.keras.layers.Layer,
detection_generator: tf.keras.layers.Layer,
min_level: Optional[int] = None,
max_level: Optional[int] = None,
num_scales: Optional[int] = None,
aspect_ratios: Optional[List[float]] = None,
anchor_size: Optional[float] = None,
**kwargs):
"""Classification initialization function.
Args:
backbone: `tf.keras.Model` a backbone network.
decoder: `tf.keras.Model` a decoder network.
head: `RetinaNetHead`, the RetinaNet head.
detection_generator: the detection generator.
min_level: Minimum level in output feature maps.
max_level: Maximum level in output feature maps.
num_scales: A number representing intermediate scales added
on each level. For instance, num_scales=2 adds one additional
intermediate anchor scale, yielding [2^0, 2^0.5] on each level.
aspect_ratios: A list representing the aspect ratio
anchors added on each level. The number indicates the ratio of width to
height. For instance, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors
on each scale level.
anchor_size: A number representing the scale of the base anchor size
relative to the feature stride 2^level.
**kwargs: keyword arguments to be passed.
"""
super(RetinaNetModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'detection_generator': detection_generator,
'min_level': min_level,
'max_level': max_level,
'num_scales': num_scales,
'aspect_ratios': aspect_ratios,
'anchor_size': anchor_size,
}
self._backbone = backbone
self._decoder = decoder
self._head = head
self._detection_generator = detection_generator
def call(self,
images: tf.Tensor,
image_shape: Optional[tf.Tensor] = None,
anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
output_intermediate_features: bool = False,
training: Optional[bool] = None) -> Mapping[str, tf.Tensor]:
"""Forward pass of the RetinaNet model.
Args:
images: `Tensor`, the input batched images, whose shape is
[batch, height, width, 3].
image_shape: `Tensor`, the actual shape of the input images, whose shape
is [batch, 2] where the last dimension is [height, width]. Note that
this is the actual image shape excluding paddings. For example, images
in the batch may be resized into different shapes before padding to the
fixed size.
anchor_boxes: a dict of tensors which includes multilevel anchors.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the anchor coordinates of a particular feature
level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
output_intermediate_features: `bool` indicating whether to return the
intermediate feature maps generated by backbone and decoder.
training: `bool`, indicating whether it is in training mode.
Returns:
scores: a dict of tensors which includes scores of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box scores predicted from a particular feature
level, whose shape is
[batch, height_l, width_l, num_classes * num_anchors_per_location].
boxes: a dict of tensors which includes coordinates of the predictions.
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the box coordinates predicted from a particular
feature level, whose shape is
[batch, height_l, width_l, 4 * num_anchors_per_location].
attributes: a dict of (attribute_name, attribute_predictions). Each
attribute prediction is a dict that includes:
- key: `str`, the level of the multilevel predictions.
- values: `Tensor`, the attribute predictions from a particular
feature level, whose shape is
[batch, height_l, width_l, att_size * num_anchors_per_location].
"""
outputs = {}
# Feature extraction.
features = self.backbone(images)
if output_intermediate_features:
outputs.update(
{'backbone_{}'.format(k): v for k, v in features.items()})
if self.decoder:
features = self.decoder(features)
if output_intermediate_features:
outputs.update(
{'decoder_{}'.format(k): v for k, v in features.items()})
# Dense prediction. `raw_attributes` can be empty.
raw_scores, raw_boxes, raw_attributes = self.head(features)
if training:
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if raw_attributes:
outputs.update({'attribute_outputs': raw_attributes})
return outputs
else:
# Generate anchor boxes for this batch if not provided.
if anchor_boxes is None:
_, image_height, image_width, _ = images.get_shape().as_list()
anchor_boxes = anchor.Anchor(
min_level=self._config_dict['min_level'],
max_level=self._config_dict['max_level'],
num_scales=self._config_dict['num_scales'],
aspect_ratios=self._config_dict['aspect_ratios'],
anchor_size=self._config_dict['anchor_size'],
image_size=(image_height, image_width)).multilevel_boxes
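# Broadcast the per-level anchor boxes across the batch dimension so
# they match the batched predictions.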
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0),
[tf.shape(images)[0], 1, 1, 1])
# Post-processing.
final_results = self.detection_generator(raw_boxes, raw_scores,
anchor_boxes, image_shape,
raw_attributes)
outputs.update({
'cls_outputs': raw_scores,
'box_outputs': raw_boxes,
})
if self.detection_generator.get_config()['apply_nms']:
outputs.update({
'detection_boxes': final_results['detection_boxes'],
'detection_scores': final_results['detection_scores'],
'detection_classes': final_results['detection_classes'],
'num_detections': final_results['num_detections']
})
else:
outputs.update({
'decoded_boxes': final_results['decoded_boxes'],
'decoded_box_scores': final_results['decoded_box_scores']
})
if raw_attributes:
outputs.update({
'attribute_outputs': raw_attributes,
'detection_attributes': final_results['detection_attributes'],
})
return outputs
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
return items
@property
def backbone(self) -> tf.keras.Model:
return self._backbone
@property
def decoder(self) -> tf.keras.Model:
return self._decoder
@property
def head(self) -> tf.keras.layers.Layer:
return self._head
@property
def detection_generator(self) -> tf.keras.layers.Layer:
return self._detection_generator
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config):
return cls(**config)
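# A minimal usage sketch, assuming the backbone/decoder/head modules used in
# the tests below; the hyperparameters here are illustrative, not required.
if __name__ == '__main__':
  import numpy as np
  from official.vision.modeling.backbones import resnet
  from official.vision.modeling.decoders import fpn
  from official.vision.modeling.heads import dense_prediction_heads
  from official.vision.modeling.layers import detection_generator

  backbone = resnet.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs, min_level=3, max_level=7)
  head = dense_prediction_heads.RetinaNetHead(
      min_level=3,
      max_level=7,
      num_classes=3,
      num_anchors_per_location=3)  # num_scales * len(aspect_ratios)
  generator = detection_generator.MultilevelDetectionGenerator(
      max_num_detections=10)
  model = RetinaNetModel(
      backbone=backbone,
      decoder=decoder,
      head=head,
      detection_generator=generator,
      min_level=3,
      max_level=7,
      num_scales=3,
      aspect_ratios=[1.0],
      anchor_size=3)
  # In training mode only the raw multilevel logits and box deltas are
  # returned; anchor generation and NMS happen in inference mode.
  outputs = model(np.random.rand(2, 384, 384, 3), training=True)
  print(sorted(outputs.keys()))  # ['box_outputs', 'cls_outputs']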
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for RetinaNet models."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.vision.modeling import retinanet_model
from official.vision.modeling.backbones import resnet
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import dense_prediction_heads
from official.vision.modeling.layers import detection_generator
from official.vision.ops import anchor
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
{
'use_separable_conv': True,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': False,
'is_training': False,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': False,
'is_training': True,
'has_att_heads': False
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': True,
'has_att_heads': True
},
{
'use_separable_conv': False,
'build_anchor_boxes': True,
'is_training': False,
'has_att_heads': True
},
)
def test_build_model(self, use_separable_conv, build_anchor_boxes,
is_training, has_att_heads):
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
anchor_size = 3
fpn_num_filters = 256
head_num_convs = 4
head_num_filters = 256
num_anchors_per_location = num_scales * len(aspect_ratios)
image_size = 384
images = np.random.rand(2, image_size, image_size, 3)
image_shape = np.array([[image_size, image_size], [image_size, image_size]])
if build_anchor_boxes:
anchor_boxes = anchor.Anchor(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size,
image_size=(image_size, image_size)).multilevel_boxes
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
else:
anchor_boxes = None
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
else:
attribute_heads = None
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level,
num_filters=fpn_num_filters,
use_separable_conv=use_separable_conv)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
attribute_heads=attribute_heads,
num_anchors_per_location=num_anchors_per_location,
use_separable_conv=use_separable_conv,
num_convs=head_num_convs,
num_filters=head_num_filters)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=anchor_size)
_ = model(images, image_shape, anchor_boxes, training=is_training)
@combinations.generate(
combinations.combine(
strategy=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
image_size=[
(128, 128),
],
training=[True, False],
has_att_heads=[True, False],
output_intermediate_features=[True, False],
soft_nms_sigma=[None, 0.0, 0.1],
))
def test_forward(self, strategy, image_size, training, has_att_heads,
output_intermediate_features, soft_nms_sigma):
"""Test for creation of a R50-FPN RetinaNet."""
tf.keras.backend.set_image_data_format('channels_last')
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
images = np.random.rand(2, image_size[0], image_size[1], 3)
image_shape = np.array(
[[image_size[0], image_size[1]], [image_size[0], image_size[1]]])
with strategy.scope():
anchor_gen = anchor.build_anchor_generator(
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
anchor_boxes = anchor_gen(image_size)
for l in anchor_boxes:
anchor_boxes[l] = tf.tile(
tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
else:
attribute_heads = None
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
attribute_heads=attribute_heads,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10,
nms_version='v1',
use_cpu_nms=soft_nms_sigma is not None,
soft_nms_sigma=soft_nms_sigma)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator)
model_outputs = model(
images,
image_shape,
anchor_boxes,
output_intermediate_features=output_intermediate_features,
training=training)
if training:
cls_outputs = model_outputs['cls_outputs']
box_outputs = model_outputs['box_outputs']
for level in range(min_level, max_level + 1):
self.assertIn(str(level), cls_outputs)
self.assertIn(str(level), box_outputs)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
num_classes * num_anchors_per_location
], cls_outputs[str(level)].numpy().shape)
self.assertAllEqual([
2,
image_size[0] // 2**level,
image_size[1] // 2**level,
4 * num_anchors_per_location
], box_outputs[str(level)].numpy().shape)
if has_att_heads:
att_outputs = model_outputs['attribute_outputs']
for att in att_outputs.values():
self.assertAllEqual([
2, image_size[0] // 2**level, image_size[1] // 2**level,
1 * num_anchors_per_location
], att[str(level)].numpy().shape)
else:
self.assertIn('detection_boxes', model_outputs)
self.assertIn('detection_scores', model_outputs)
self.assertIn('detection_classes', model_outputs)
self.assertIn('num_detections', model_outputs)
self.assertAllEqual(
[2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_scores'].numpy().shape)
self.assertAllEqual(
[2, 10], model_outputs['detection_classes'].numpy().shape)
self.assertAllEqual(
[2,], model_outputs['num_detections'].numpy().shape)
if has_att_heads:
self.assertIn('detection_attributes', model_outputs)
self.assertAllEqual(
[2, 10, 1],
model_outputs['detection_attributes']['depth'].numpy().shape)
if output_intermediate_features:
for l in range(2, 6):
self.assertIn('backbone_{}'.format(l), model_outputs)
self.assertAllEqual([
2, image_size[0] // 2**l, image_size[1] // 2**l,
backbone.output_specs[str(l)].as_list()[-1]
], model_outputs['backbone_{}'.format(l)].numpy().shape)
for l in range(min_level, max_level + 1):
self.assertIn('decoder_{}'.format(l), model_outputs)
self.assertAllEqual([
2, image_size[0] // 2**l, image_size[1] // 2**l,
decoder.output_specs[str(l)].as_list()[-1]
], model_outputs['decoder_{}'.format(l)].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
min_level = 3
max_level = 7
num_scales = 3
aspect_ratios = [1.0]
num_anchors_per_location = num_scales * len(aspect_ratios)
backbone = resnet.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs,
min_level=min_level,
max_level=max_level)
head = dense_prediction_heads.RetinaNetHead(
min_level=min_level,
max_level=max_level,
num_classes=num_classes,
num_anchors_per_location=num_anchors_per_location)
generator = detection_generator.MultilevelDetectionGenerator(
max_num_detections=10)
model = retinanet_model.RetinaNetModel(
backbone=backbone,
decoder=decoder,
head=head,
detection_generator=generator,
min_level=min_level,
max_level=max_level,
num_scales=num_scales,
aspect_ratios=aspect_ratios,
anchor_size=3)
config = model.get_config()
new_model = retinanet_model.RetinaNetModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build segmentation models."""
from typing import Any, Mapping, Union, Optional, Dict
# Import libraries
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class SegmentationModel(tf.keras.Model):
"""A Segmentation class model.
Input images are passed through backbone first. Decoder network is then
applied, and finally, segmentation head is applied on the output of the
decoder network. Layers such as ASPP should be part of decoder. Any feature
fusion is done as part of the segmentation head (i.e. deeplabv3+ feature
fusion is not part of the decoder, instead it is part of the segmentation
head). This way, different feature fusion techniques can be combined with
different backbones, and decoders.
"""
def __init__(self, backbone: tf.keras.Model, decoder: tf.keras.Model,
head: tf.keras.layers.Layer,
mask_scoring_head: Optional[tf.keras.layers.Layer] = None,
**kwargs):
"""Segmentation initialization function.
Args:
backbone: a backbone network.
decoder: a decoder network. E.g. FPN.
head: segmentation head.
mask_scoring_head: mask scoring head.
**kwargs: keyword arguments to be passed.
"""
super(SegmentationModel, self).__init__(**kwargs)
self._config_dict = {
'backbone': backbone,
'decoder': decoder,
'head': head,
'mask_scoring_head': mask_scoring_head,
}
self.backbone = backbone
self.decoder = decoder
self.head = head
self.mask_scoring_head = mask_scoring_head
def call(self, inputs: tf.Tensor, training: Optional[bool] = None
) -> Dict[str, tf.Tensor]:
backbone_features = self.backbone(inputs)
if self.decoder:
decoder_features = self.decoder(backbone_features)
else:
decoder_features = backbone_features
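# The head receives both backbone and decoder features so that any
# feature fusion (e.g. deeplabv3+) can happen inside the head.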
logits = self.head((backbone_features, decoder_features))
outputs = {'logits': logits}
if self.mask_scoring_head:
mask_scores = self.mask_scoring_head(logits)
outputs.update({'mask_scores': mask_scores})
return outputs
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
items = dict(backbone=self.backbone, head=self.head)
if self.decoder is not None:
items.update(decoder=self.decoder)
if self.mask_scoring_head is not None:
items.update(mask_scoring_head=self.mask_scoring_head)
return items
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
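# A minimal usage sketch, assuming the same building blocks as the tests
# below; the settings are illustrative.
if __name__ == '__main__':
  import numpy as np
  from official.vision.modeling import backbones
  from official.vision.modeling.decoders import fpn
  from official.vision.modeling.heads import segmentation_heads

  backbone = backbones.ResNet(model_id=50)
  decoder = fpn.FPN(
      input_specs=backbone.output_specs, min_level=2, max_level=7)
  head = segmentation_heads.SegmentationHead(num_classes=10, level=3)
  model = SegmentationModel(backbone=backbone, decoder=decoder, head=head)
  outputs = model(np.random.rand(2, 128, 128, 3))
  # Logits come out at the head's level, i.e. stride 2**3 of the input.
  print(outputs['logits'].shape)  # (2, 16, 16, 10)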
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for segmentation network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import segmentation_model
from official.vision.modeling.decoders import fpn
from official.vision.modeling.heads import segmentation_heads
class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(128, 2),
(128, 3),
(128, 4),
(256, 2),
(256, 3),
(256, 4),
)
def test_segmentation_network_creation(
self, input_size, level):
"""Test for creation of a segmentation network."""
num_classes = 10
inputs = np.random.rand(2, input_size, input_size, 3)
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs, min_level=2, max_level=7)
head = segmentation_heads.SegmentationHead(num_classes, level=level)
model = segmentation_model.SegmentationModel(
backbone=backbone,
decoder=decoder,
head=head,
mask_scoring_head=None,
)
outputs = model(inputs)
self.assertAllEqual(
[2, input_size // (2**level), input_size // (2**level), num_classes],
outputs['logits'].numpy().shape)
def test_serialize_deserialize(self):
"""Validate the network can be serialized and deserialized."""
num_classes = 3
backbone = backbones.ResNet(model_id=50)
decoder = fpn.FPN(
input_specs=backbone.output_specs, min_level=3, max_level=7)
head = segmentation_heads.SegmentationHead(num_classes, level=3)
model = segmentation_model.SegmentationModel(
backbone=backbone,
decoder=decoder,
head=head
)
config = model.get_config()
new_model = segmentation_model.SegmentationModel.from_config(config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build video classification models."""
from typing import Any, Mapping, Optional, Union, List, Text
import tensorflow as tf
layers = tf.keras.layers
@tf.keras.utils.register_keras_serializable(package='Vision')
class VideoClassificationModel(tf.keras.Model):
"""A video classification class builder."""
def __init__(
self,
backbone: tf.keras.Model,
num_classes: int,
input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None,
dropout_rate: float = 0.0,
aggregate_endpoints: bool = False,
kernel_initializer: str = 'random_uniform',
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
require_endpoints: Optional[List[Text]] = None,
**kwargs):
"""Video Classification initialization function.
Args:
backbone: a 3d backbone network.
num_classes: `int` number of classes in classification task.
input_specs: `tf.keras.layers.InputSpec` specs of the input tensor.
dropout_rate: `float` rate for dropout regularization.
aggregate_endpoints: `bool` whether to aggregate all endpoints or use
only the final endpoint.
kernel_initializer: kernel initializer for the dense layer.
kernel_regularizer: tf.keras.regularizers.Regularizer object. Defaults to
None.
bias_regularizer: tf.keras.regularizers.Regularizer object. Defaults to
None.
require_endpoints: the required endpoints for prediction. If None or
empty, then only the final endpoint is used.
**kwargs: keyword arguments to be passed.
"""
if not input_specs:
input_specs = {
'image': layers.InputSpec(shape=[None, None, None, None, 3])
}
self._self_setattr_tracking = False
self._config_dict = {
'backbone': backbone,
'num_classes': num_classes,
'input_specs': input_specs,
'dropout_rate': dropout_rate,
'aggregate_endpoints': aggregate_endpoints,
'kernel_initializer': kernel_initializer,
'kernel_regularizer': kernel_regularizer,
'bias_regularizer': bias_regularizer,
'require_endpoints': require_endpoints,
}
self._input_specs = input_specs
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._backbone = backbone
inputs = {
k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items()
}
endpoints = backbone(inputs['image'])
if aggregate_endpoints:
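# Pools every backbone endpoint and concatenates them for prediction.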
pooled_feats = []
for endpoint in endpoints.values():
x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint)
pooled_feats.append(x_pool)
x = tf.concat(pooled_feats, axis=1)
else:
if not require_endpoints:
# Uses the last endpoint for prediction.
x = endpoints[max(endpoints.keys())]
x = tf.keras.layers.GlobalAveragePooling3D()(x)
else:
# Concatenates all the required endpoints for prediction.
outputs = []
for name in require_endpoints:
x = endpoints[name]
x = tf.keras.layers.GlobalAveragePooling3D()(x)
outputs.append(x)
x = tf.concat(outputs, axis=1)
x = tf.keras.layers.Dropout(dropout_rate)(x)
x = tf.keras.layers.Dense(
num_classes, kernel_initializer=kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
super(VideoClassificationModel, self).__init__(
inputs=inputs, outputs=x, **kwargs)
@property
def checkpoint_items(
self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]:
"""Returns a dictionary of items to be additionally checkpointed."""
return dict(backbone=self.backbone)
@property
def backbone(self) -> tf.keras.Model:
return self._backbone
def get_config(self) -> Mapping[str, Any]:
return self._config_dict
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
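# A minimal usage sketch, assuming a 3D ResNet backbone configured as in the
# tests below; the temporal strides and kernel sizes are illustrative.
if __name__ == '__main__':
  import numpy as np
  from official.vision.modeling import backbones

  input_specs = layers.InputSpec(shape=[None, 8, 112, 112, 3])
  backbone = backbones.ResNet3D(
      model_id=50,
      temporal_strides=[1, 1, 1, 1],
      temporal_kernel_sizes=[(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
                             (1, 3, 1)],
      input_specs=input_specs)
  model = VideoClassificationModel(
      backbone=backbone,
      num_classes=1000,
      input_specs={'image': input_specs},
      dropout_rate=0.2)
  logits = model(np.random.rand(2, 8, 112, 112, 3))
  print(logits.shape)  # (2, 1000)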
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for video classification network."""
# Import libraries
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.vision.modeling import backbones
from official.vision.modeling import video_classification_model
class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(50, 8, 112, 'relu', False),
(50, 8, 112, 'swish', True),
)
def test_resnet3d_network_creation(self, model_id, temporal_size,
spatial_size, activation,
aggregate_endpoints):
"""Test for creation of a ResNet3D-50 classifier."""
input_specs = tf.keras.layers.InputSpec(
shape=[None, temporal_size, spatial_size, spatial_size, 3])
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
tf.keras.backend.set_image_data_format('channels_last')
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes,
input_specs=input_specs,
activation=activation)
num_classes = 1000
model = video_classification_model.VideoClassificationModel(
backbone=backbone,
num_classes=num_classes,
input_specs={'image': input_specs},
dropout_rate=0.2,
aggregate_endpoints=aggregate_endpoints,
)
inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3)
logits = model(inputs)
self.assertAllEqual([2, num_classes], logits.numpy().shape)
def test_serialize_deserialize(self):
"""Validate the classification network can be serialized and deserialized."""
model_id = 50
temporal_strides = [1, 1, 1, 1]
temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1),
(1, 3, 1)]
backbone = backbones.ResNet3D(
model_id=model_id,
temporal_strides=temporal_strides,
temporal_kernel_sizes=temporal_kernel_sizes)
model = video_classification_model.VideoClassificationModel(
backbone=backbone, num_classes=1000)
config = model.get_config()
new_model = video_classification_model.VideoClassificationModel.from_config(
config)
# Validate that the config can be forced to JSON.
_ = new_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(model.get_config(), new_model.get_config())
if __name__ == '__main__':
tf.test.main()