Unverified Commit 420a7253, authored by pkulzc and committed by GitHub

Refactor tests for Object Detection API. (#8688)

Internal changes

--

PiperOrigin-RevId: 316837667
parent d0ef3913
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the CenterNet Meta architecture code."""
from __future__ import division
import functools
import unittest
from absl.testing import parameterized
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.core import losses
from object_detection.core import preprocessor
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner as cn_assigner
from object_detection.meta_architectures import center_net_meta_arch as cnma
from object_detection.models import center_net_resnet_feature_extractor
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchPredictionHeadTest(test_case.TestCase):
"""Test CenterNet meta architecture prediction head."""
def test_prediction_head(self):
head = cnma.make_prediction_net(num_out_channels=7)
output = head(np.zeros((4, 128, 128, 8)))
self.assertEqual((4, 128, 128, 7), output.shape)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchHelpersTest(test_case.TestCase, parameterized.TestCase):
"""Test for CenterNet meta architecture related functions."""
def test_row_col_indices_from_flattened_indices(self):
"""Tests that the computation of row, col, channel indices is correct."""
r_grid, c_grid, ch_grid = (np.zeros((5, 4, 3), dtype=np.int32),
np.zeros((5, 4, 3), dtype=np.int32),
np.zeros((5, 4, 3), dtype=np.int32))
r_grid[..., 0] = r_grid[..., 1] = r_grid[..., 2] = np.array(
[[0, 0, 0, 0],
[1, 1, 1, 1],
[2, 2, 2, 2],
[3, 3, 3, 3],
[4, 4, 4, 4]]
)
c_grid[..., 0] = c_grid[..., 1] = c_grid[..., 2] = np.array(
[[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3]]
)
for i in range(3):
ch_grid[..., i] = i
indices = np.arange(60)
ri, ci, chi = cnma.row_col_channel_indices_from_flattened_indices(
indices, 4, 3)
np.testing.assert_array_equal(ri, r_grid.flatten())
np.testing.assert_array_equal(ci, c_grid.flatten())
np.testing.assert_array_equal(chi, ch_grid.flatten())
def test_flattened_indices_from_row_col_indices(self):
r = np.array(
[[0, 0, 0, 0],
[1, 1, 1, 1],
[2, 2, 2, 2]]
)
c = np.array(
[[0, 1, 2, 3],
[0, 1, 2, 3],
[0, 1, 2, 3]]
)
idx = cnma.flattened_indices_from_row_col_indices(r, c, 4)
np.testing.assert_array_equal(np.arange(12), idx.flatten())
def test_get_valid_anchor_weights_in_flattened_image(self):
"""Tests that the anchor weights are valid upon flattening out."""
valid_weights = np.zeros((2, 5, 5), dtype=np.float32)
valid_weights[0, :3, :4] = 1.0
valid_weights[1, :2, :2] = 1.0
def graph_fn():
true_image_shapes = tf.constant([[3, 4], [2, 2]])
w = cnma.get_valid_anchor_weights_in_flattened_image(
true_image_shapes, 5, 5)
return w
w = self.execute(graph_fn, [])
np.testing.assert_allclose(w, valid_weights.reshape(2, -1))
self.assertEqual((2, 25), w.shape)
def test_convert_strided_predictions_to_normalized_boxes(self):
"""Tests that boxes have correct coordinates in normalized input space."""
def graph_fn():
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes[0] = [[10, 20, 30, 40], [20, 30, 50, 100], [50, 60, 100, 180]]
boxes[1] = [[-5, -5, 5, 5], [45, 60, 110, 120], [150, 150, 200, 250]]
true_image_shapes = tf.constant([[100, 90, 3], [150, 150, 3]])
clipped_boxes = (
cnma.convert_strided_predictions_to_normalized_boxes(
boxes, 2, true_image_shapes))
return clipped_boxes
clipped_boxes = self.execute(graph_fn, [])
expected_boxes = np.zeros((2, 3, 4), dtype=np.float32)
expected_boxes[0] = [[0.2, 4./9, 0.6, 8./9], [0.4, 2./3, 1, 1],
[1, 1, 1, 1]]
expected_boxes[1] = [[0., 0, 1./15, 1./15], [3./5, 4./5, 1, 1],
[1, 1, 1, 1]]
np.testing.assert_allclose(expected_boxes, clipped_boxes)
@parameterized.parameters(
{'clip_to_window': True},
{'clip_to_window': False}
)
def test_convert_strided_predictions_to_normalized_keypoints(
self, clip_to_window):
"""Tests that keypoints have correct coordinates in normalized coords."""
keypoint_coords_np = np.array(
[
# Example 0.
[
[[-10., 8.], [60., 22.], [60., 120.]],
[[20., 20.], [0., 0.], [0., 0.]],
],
# Example 1.
[
[[40., 50.], [20., 160.], [200., 150.]],
[[10., 0.], [40., 10.], [0., 0.]],
],
], dtype=np.float32)
keypoint_scores_np = np.array(
[
# Example 0.
[
[1.0, 0.9, 0.2],
[0.7, 0.0, 0.0],
],
# Example 1.
[
[1.0, 1.0, 0.2],
[0.7, 0.6, 0.0],
],
], dtype=np.float32)
def graph_fn():
keypoint_coords = tf.constant(keypoint_coords_np, dtype=tf.float32)
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
true_image_shapes = tf.constant([[320, 400, 3], [640, 640, 3]])
stride = 4
keypoint_coords_out, keypoint_scores_out = (
cnma.convert_strided_predictions_to_normalized_keypoints(
keypoint_coords, keypoint_scores, stride, true_image_shapes,
clip_to_window))
return keypoint_coords_out, keypoint_scores_out
keypoint_coords_out, keypoint_scores_out = self.execute(graph_fn, [])
if clip_to_window:
expected_keypoint_coords_np = np.array(
[
# Example 0.
[
[[0.0, 0.08], [0.75, 0.22], [0.75, 1.0]],
[[0.25, 0.2], [0., 0.], [0.0, 0.0]],
],
# Example 1.
[
[[0.25, 0.3125], [0.125, 1.0], [1.0, 0.9375]],
[[0.0625, 0.], [0.25, 0.0625], [0., 0.]],
],
], dtype=np.float32)
expected_keypoint_scores_np = np.array(
[
# Example 0.
[
[0.0, 0.9, 0.0],
[0.7, 0.0, 0.0],
],
# Example 1.
[
[1.0, 1.0, 0.0],
[0.7, 0.6, 0.0],
],
], dtype=np.float32)
else:
expected_keypoint_coords_np = np.array(
[
# Example 0.
[
[[-0.125, 0.08], [0.75, 0.22], [0.75, 1.2]],
[[0.25, 0.2], [0., 0.], [0., 0.]],
],
# Example 1.
[
[[0.25, 0.3125], [0.125, 1.0], [1.25, 0.9375]],
[[0.0625, 0.], [0.25, 0.0625], [0., 0.]],
],
], dtype=np.float32)
expected_keypoint_scores_np = np.array(
[
# Example 0.
[
[1.0, 0.9, 0.2],
[0.7, 0.0, 0.0],
],
# Example 1.
[
[1.0, 1.0, 0.2],
[0.7, 0.6, 0.0],
],
], dtype=np.float32)
np.testing.assert_allclose(expected_keypoint_coords_np, keypoint_coords_out)
np.testing.assert_allclose(expected_keypoint_scores_np, keypoint_scores_out)
def test_convert_strided_predictions_to_instance_masks(self):
def graph_fn():
boxes = tf.constant(
[
[[0.5, 0.5, 1.0, 1.0],
[0.0, 0.5, 0.5, 1.0],
[0.0, 0.0, 0.0, 0.0]],
], tf.float32)
classes = tf.constant(
[
[0, 1, 0],
], tf.int32)
masks_np = np.zeros((1, 4, 4, 2), dtype=np.float32)
masks_np[0, :, 2:, 0] = 1 # Class 0.
masks_np[0, :, :3, 1] = 1 # Class 1.
masks = tf.constant(masks_np)
true_image_shapes = tf.constant([[6, 8, 3]])
instance_masks = cnma.convert_strided_predictions_to_instance_masks(
boxes, classes, masks, stride=2, mask_height=2, mask_width=2,
true_image_shapes=true_image_shapes)
return instance_masks
instance_masks = self.execute_cpu(graph_fn, [])
expected_instance_masks = np.array(
[
[
# Mask 0 (class 0).
[[1, 1],
[1, 1]],
# Mask 1 (class 1).
[[1, 0],
[1, 0]],
# Mask 2 (class 0).
[[0, 0],
[0, 0]],
]
])
np.testing.assert_array_equal(expected_instance_masks, instance_masks)
def test_top_k_feature_map_locations(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 1] = 1.0
feature_map_np[0, 2, 1, 1] = 0.9  # Gets filtered due to max pool.
feature_map_np[0, 0, 1, 0] = 0.7
feature_map_np[0, 2, 2, 0] = 0.5
feature_map_np[0, 2, 2, 1] = -0.3
feature_map_np[1, 2, 1, 1] = 0.7
feature_map_np[1, 1, 0, 0] = 0.4
feature_map_np[1, 1, 2, 0] = 0.1
def graph_fn():
feature_map = tf.constant(feature_map_np)
scores, y_inds, x_inds, channel_inds = (
cnma.top_k_feature_map_locations(
feature_map, max_pool_kernel_size=3, k=3))
return scores, y_inds, x_inds, channel_inds
scores, y_inds, x_inds, channel_inds = self.execute(graph_fn, [])
np.testing.assert_allclose([1.0, 0.7, 0.5], scores[0])
np.testing.assert_array_equal([2, 0, 2], y_inds[0])
np.testing.assert_array_equal([0, 1, 2], x_inds[0])
np.testing.assert_array_equal([1, 0, 0], channel_inds[0])
np.testing.assert_allclose([0.7, 0.4, 0.1], scores[1])
np.testing.assert_array_equal([2, 1, 1], y_inds[1])
np.testing.assert_array_equal([1, 0, 2], x_inds[1])
np.testing.assert_array_equal([1, 0, 0], channel_inds[1])
def test_top_k_feature_map_locations_no_pooling(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 1] = 1.0
feature_map_np[0, 2, 1, 1] = 0.9
feature_map_np[0, 0, 1, 0] = 0.7
feature_map_np[0, 2, 2, 0] = 0.5
feature_map_np[0, 2, 2, 1] = -0.3
feature_map_np[1, 2, 1, 1] = 0.7
feature_map_np[1, 1, 0, 0] = 0.4
feature_map_np[1, 1, 2, 0] = 0.1
def graph_fn():
feature_map = tf.constant(feature_map_np)
scores, y_inds, x_inds, channel_inds = (
cnma.top_k_feature_map_locations(
feature_map, max_pool_kernel_size=1, k=3))
return scores, y_inds, x_inds, channel_inds
scores, y_inds, x_inds, channel_inds = self.execute(graph_fn, [])
np.testing.assert_allclose([1.0, 0.9, 0.7], scores[0])
np.testing.assert_array_equal([2, 2, 0], y_inds[0])
np.testing.assert_array_equal([0, 1, 1], x_inds[0])
np.testing.assert_array_equal([1, 1, 0], channel_inds[0])
np.testing.assert_allclose([0.7, 0.4, 0.1], scores[1])
np.testing.assert_array_equal([2, 1, 1], y_inds[1])
np.testing.assert_array_equal([1, 0, 2], x_inds[1])
np.testing.assert_array_equal([1, 0, 0], channel_inds[1])
def test_top_k_feature_map_locations_per_channel(self):
feature_map_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
feature_map_np[0, 2, 0, 0] = 1.0 # Selected.
feature_map_np[0, 2, 1, 0] = 0.9  # Gets filtered due to max pool.
feature_map_np[0, 0, 1, 0] = 0.7 # Selected.
feature_map_np[0, 2, 2, 1] = 0.5 # Selected.
feature_map_np[0, 0, 0, 1] = 0.3 # Selected.
feature_map_np[1, 2, 1, 0] = 0.7 # Selected.
feature_map_np[1, 1, 0, 0] = 0.4  # Gets filtered due to max pool.
feature_map_np[1, 1, 2, 0] = 0.3  # Gets filtered due to max pool.
feature_map_np[1, 1, 0, 1] = 0.8 # Selected.
feature_map_np[1, 1, 2, 1] = 0.3 # Selected.
def graph_fn():
feature_map = tf.constant(feature_map_np)
scores, y_inds, x_inds, channel_inds = (
cnma.top_k_feature_map_locations(
feature_map, max_pool_kernel_size=3, k=2, per_channel=True))
return scores, y_inds, x_inds, channel_inds
scores, y_inds, x_inds, channel_inds = self.execute(graph_fn, [])
np.testing.assert_allclose([1.0, 0.7, 0.5, 0.3], scores[0])
np.testing.assert_array_equal([2, 0, 2, 0], y_inds[0])
np.testing.assert_array_equal([0, 1, 2, 0], x_inds[0])
np.testing.assert_array_equal([0, 0, 1, 1], channel_inds[0])
np.testing.assert_allclose([0.7, 0.0, 0.8, 0.3], scores[1])
np.testing.assert_array_equal([2, 0, 1, 1], y_inds[1])
np.testing.assert_array_equal([1, 0, 0, 2], x_inds[1])
np.testing.assert_array_equal([0, 0, 1, 1], channel_inds[1])
def test_box_prediction(self):
class_pred = np.zeros((3, 128, 128, 5), dtype=np.float32)
hw_pred = np.zeros((3, 128, 128, 2), dtype=np.float32)
offset_pred = np.zeros((3, 128, 128, 2), dtype=np.float32)
# Sample 1, 2 boxes
class_pred[0, 10, 20] = [0.3, .7, 0.0, 0.0, 0.0]
hw_pred[0, 10, 20] = [40, 60]
offset_pred[0, 10, 20] = [1, 2]
class_pred[0, 50, 60] = [0.55, 0.0, 0.0, 0.0, 0.45]
hw_pred[0, 50, 60] = [50, 50]
offset_pred[0, 50, 60] = [0, 0]
# Sample 2, 2 boxes (at same location)
class_pred[1, 100, 100] = [0.0, 0.1, 0.9, 0.0, 0.0]
hw_pred[1, 100, 100] = [10, 10]
offset_pred[1, 100, 100] = [1, 3]
# Sample 3, 3 boxes
class_pred[2, 60, 90] = [0.0, 0.0, 0.0, 0.2, 0.8]
hw_pred[2, 60, 90] = [40, 30]
offset_pred[2, 60, 90] = [0, 0]
class_pred[2, 65, 95] = [0.0, 0.7, 0.3, 0.0, 0.0]
hw_pred[2, 65, 95] = [20, 20]
offset_pred[2, 65, 95] = [1, 2]
class_pred[2, 75, 85] = [1.0, 0.0, 0.0, 0.0, 0.0]
hw_pred[2, 75, 85] = [21, 25]
offset_pred[2, 75, 85] = [5, 2]
def graph_fn():
class_pred_tensor = tf.constant(class_pred)
hw_pred_tensor = tf.constant(hw_pred)
offset_pred_tensor = tf.constant(offset_pred)
detection_scores, y_indices, x_indices, channel_indices = (
cnma.top_k_feature_map_locations(
class_pred_tensor, max_pool_kernel_size=3, k=2))
boxes, classes, scores, num_dets = cnma.prediction_tensors_to_boxes(
detection_scores, y_indices, x_indices, channel_indices,
hw_pred_tensor, offset_pred_tensor)
return boxes, classes, scores, num_dets
boxes, classes, scores, num_dets = self.execute(graph_fn, [])
np.testing.assert_array_equal(num_dets, [2, 2, 2])
np.testing.assert_allclose(
[[-9, -8, 31, 52], [25, 35, 75, 85]], boxes[0])
np.testing.assert_allclose(
[[96, 98, 106, 108], [96, 98, 106, 108]], boxes[1])
np.testing.assert_allclose(
[[69.5, 74.5, 90.5, 99.5], [40, 75, 80, 105]], boxes[2])
np.testing.assert_array_equal(classes[0], [1, 0])
np.testing.assert_array_equal(classes[1], [2, 1])
np.testing.assert_array_equal(classes[2], [0, 4])
np.testing.assert_allclose(scores[0], [.7, .55])
np.testing.assert_allclose(scores[1][:1], [.9])
np.testing.assert_allclose(scores[2], [1., .8])
def test_keypoint_candidate_prediction(self):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0
keypoint_heatmap_np[0, 2, 1, 0] = 0.7
keypoint_heatmap_np[0, 1, 1, 0] = 0.6
keypoint_heatmap_np[0, 0, 2, 1] = 0.7
keypoint_heatmap_np[0, 1, 1, 1] = 0.3 # Filtered by low score.
keypoint_heatmap_np[0, 2, 2, 1] = 0.2
keypoint_heatmap_np[1, 1, 0, 0] = 0.6
keypoint_heatmap_np[1, 2, 1, 0] = 0.5
keypoint_heatmap_np[1, 0, 0, 0] = 0.4
keypoint_heatmap_np[1, 0, 0, 1] = 1.0
keypoint_heatmap_np[1, 0, 1, 1] = 0.9
keypoint_heatmap_np[1, 2, 0, 1] = 0.8
keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25]
keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5]
keypoint_heatmap_offsets_np[0, 1, 1] = [0.0, 0.0]
keypoint_heatmap_offsets_np[0, 0, 2] = [1.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 2] = [1.0, 1.0]
keypoint_heatmap_offsets_np[1, 1, 0] = [0.25, 0.5]
keypoint_heatmap_offsets_np[1, 2, 1] = [0.5, 0.0]
keypoint_heatmap_offsets_np[1, 0, 0] = [0.0, -0.5]
keypoint_heatmap_offsets_np[1, 0, 1] = [0.5, -0.5]
keypoint_heatmap_offsets_np[1, 2, 0] = [-1.0, -0.5]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_heatmap_offsets = tf.constant(
keypoint_heatmap_offsets_np, dtype=tf.float32)
keypoint_cands, keypoint_scores, num_keypoint_candidates = (
cnma.prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_heatmap_offsets,
keypoint_score_threshold=0.5,
max_pool_kernel_size=1,
max_candidates=2))
return keypoint_cands, keypoint_scores, num_keypoint_candidates
(keypoint_cands, keypoint_scores,
num_keypoint_candidates) = self.execute(graph_fn, [])
expected_keypoint_candidates = [
[ # Example 0.
[[0.5, 0.25], [1.0, 2.0]], # Keypoint 1.
[[1.75, 1.5], [1.0, 1.0]], # Keypoint 2.
],
[ # Example 1.
[[1.25, 0.5], [0.0, -0.5]], # Keypoint 1.
[[2.5, 1.0], [0.5, 0.5]], # Keypoint 2.
],
]
expected_keypoint_scores = [
[ # Example 0.
[1.0, 0.7], # Keypoint 1.
[0.7, 0.3], # Keypoint 2.
],
[ # Example 1.
[0.6, 1.0], # Keypoint 1.
[0.5, 0.9], # Keypoint 2.
],
]
expected_num_keypoint_candidates = [
[2, 1],
[2, 2]
]
np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands)
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
np.testing.assert_array_equal(expected_num_keypoint_candidates,
num_keypoint_candidates)
def test_keypoint_candidate_prediction_per_keypoints(self):
keypoint_heatmap_np = np.zeros((2, 3, 3, 2), dtype=np.float32)
keypoint_heatmap_np[0, 0, 0, 0] = 1.0
keypoint_heatmap_np[0, 2, 1, 0] = 0.7
keypoint_heatmap_np[0, 1, 1, 0] = 0.6
keypoint_heatmap_np[0, 0, 2, 1] = 0.7
keypoint_heatmap_np[0, 1, 1, 1] = 0.3 # Filtered by low score.
keypoint_heatmap_np[0, 2, 2, 1] = 0.2
keypoint_heatmap_np[1, 1, 0, 0] = 0.6
keypoint_heatmap_np[1, 2, 1, 0] = 0.5
keypoint_heatmap_np[1, 0, 0, 0] = 0.4
keypoint_heatmap_np[1, 0, 0, 1] = 1.0
keypoint_heatmap_np[1, 0, 1, 1] = 0.9
keypoint_heatmap_np[1, 2, 0, 1] = 0.8
keypoint_heatmap_offsets_np = np.zeros((2, 3, 3, 4), dtype=np.float32)
keypoint_heatmap_offsets_np[0, 0, 0] = [0.5, 0.25, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 1] = [-0.25, 0.5, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 1, 1] = [0.0, 0.0, 0.0, 0.0]
keypoint_heatmap_offsets_np[0, 0, 2] = [0.0, 0.0, 1.0, 0.0]
keypoint_heatmap_offsets_np[0, 2, 2] = [0.0, 0.0, 1.0, 1.0]
keypoint_heatmap_offsets_np[1, 1, 0] = [0.25, 0.5, 0.0, 0.0]
keypoint_heatmap_offsets_np[1, 2, 1] = [0.5, 0.0, 0.0, 0.0]
keypoint_heatmap_offsets_np[1, 0, 0] = [0.0, 0.0, 0.0, -0.5]
keypoint_heatmap_offsets_np[1, 0, 1] = [0.0, 0.0, 0.5, -0.5]
keypoint_heatmap_offsets_np[1, 2, 0] = [0.0, 0.0, -1.0, -0.5]
def graph_fn():
keypoint_heatmap = tf.constant(keypoint_heatmap_np, dtype=tf.float32)
keypoint_heatmap_offsets = tf.constant(
keypoint_heatmap_offsets_np, dtype=tf.float32)
keypoint_cands, keypoint_scores, num_keypoint_candidates = (
cnma.prediction_tensors_to_keypoint_candidates(
keypoint_heatmap,
keypoint_heatmap_offsets,
keypoint_score_threshold=0.5,
max_pool_kernel_size=1,
max_candidates=2))
return keypoint_cands, keypoint_scores, num_keypoint_candidates
(keypoint_cands, keypoint_scores,
num_keypoint_candidates) = self.execute(graph_fn, [])
expected_keypoint_candidates = [
[ # Example 0.
[[0.5, 0.25], [1.0, 2.0]], # Candidate 1 of keypoint 1, 2.
[[1.75, 1.5], [1.0, 1.0]], # Candidate 2 of keypoint 1, 2.
],
[ # Example 1.
[[1.25, 0.5], [0.0, -0.5]], # Candidate 1 of keypoint 1, 2.
[[2.5, 1.0], [0.5, 0.5]], # Candidate 2 of keypoint 1, 2.
],
]
expected_keypoint_scores = [
[ # Example 0.
[1.0, 0.7], # Candidate 1 scores of keypoint 1, 2.
[0.7, 0.3], # Candidate 2 scores of keypoint 1, 2.
],
[ # Example 1.
[0.6, 1.0], # Candidate 1 scores of keypoint 1, 2.
[0.5, 0.9], # Candidate 2 scores of keypoint 1, 2.
],
]
expected_num_keypoint_candidates = [
[2, 1],
[2, 2]
]
np.testing.assert_allclose(expected_keypoint_candidates, keypoint_cands)
np.testing.assert_allclose(expected_keypoint_scores, keypoint_scores)
np.testing.assert_array_equal(expected_num_keypoint_candidates,
num_keypoint_candidates)
def test_regressed_keypoints_at_object_centers(self):
batch_size = 2
num_keypoints = 5
num_instances = 6
regressed_keypoint_feature_map_np = np.random.randn(
batch_size, 10, 10, 2 * num_keypoints).astype(np.float32)
y_indices = np.random.choice(10, (batch_size, num_instances))
x_indices = np.random.choice(10, (batch_size, num_instances))
offsets = np.stack([y_indices, x_indices], axis=2).astype(np.float32)
def graph_fn():
regressed_keypoint_feature_map = tf.constant(
regressed_keypoint_feature_map_np, dtype=tf.float32)
gathered_regressed_keypoints = (
cnma.regressed_keypoints_at_object_centers(
regressed_keypoint_feature_map,
tf.constant(y_indices, dtype=tf.int32),
tf.constant(x_indices, dtype=tf.int32)))
return gathered_regressed_keypoints
gathered_regressed_keypoints = self.execute(graph_fn, [])
expected_gathered_keypoints_0 = regressed_keypoint_feature_map_np[
0, y_indices[0], x_indices[0], :]
expected_gathered_keypoints_1 = regressed_keypoint_feature_map_np[
1, y_indices[1], x_indices[1], :]
expected_gathered_keypoints = np.stack([
expected_gathered_keypoints_0,
expected_gathered_keypoints_1], axis=0)
expected_gathered_keypoints = np.reshape(
expected_gathered_keypoints,
[batch_size, num_instances, num_keypoints, 2])
expected_gathered_keypoints += np.expand_dims(offsets, axis=2)
expected_gathered_keypoints = np.reshape(
expected_gathered_keypoints,
[batch_size, num_instances, -1])
np.testing.assert_allclose(expected_gathered_keypoints,
gathered_regressed_keypoints)
@parameterized.parameters(
{'candidate_ranking_mode': 'min_distance'},
{'candidate_ranking_mode': 'score_distance_ratio'},
)
def test_refine_keypoints(self, candidate_ranking_mode):
regressed_keypoints_np = np.array(
[
# Example 0.
[
[[2.0, 2.0], [6.0, 10.0], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [5.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 2.0], [0.0, 0.0], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
keypoint_candidates_np = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [4.0, 7.0]], # Candidate 0.
[[1.0, 8.0], [0.0, 0.0], [2.0, 2.0]], # Candidate 1.
[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]], # Candidate 2.
],
# Example 1.
[
[[6.0, 1.5], [0.1, 0.4], [0.0, 0.0]], # Candidate 0.
[[1.0, 4.0], [0.0, 0.3], [0.0, 0.0]], # Candidate 1.
[[0.0, 0.0], [0.1, 0.3], [0.0, 0.0]], # Candidate 2.
]
], dtype=np.float32)
keypoint_scores_np = np.array(
[
# Example 0.
[
[0.8, 0.9, 1.0], # Candidate 0.
[0.6, 0.1, 0.9], # Candidate 1.
[0.0, 0.0, 0.0], # Candidate 2.
],
# Example 1.
[
[0.7, 0.3, 0.0], # Candidate 0.
[0.6, 0.1, 0.0], # Candidate 1.
[0.0, 0.28, 0.0], # Candidate 2.
]
], dtype=np.float32)
num_keypoints_candidates_np = np.array(
[
# Example 0.
[2, 2, 2],
# Example 1.
[2, 3, 0],
], dtype=np.int32)
unmatched_keypoint_score = 0.1
def graph_fn():
regressed_keypoints = tf.constant(
regressed_keypoints_np, dtype=tf.float32)
keypoint_candidates = tf.constant(
keypoint_candidates_np, dtype=tf.float32)
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
num_keypoint_candidates = tf.constant(num_keypoints_candidates_np,
dtype=tf.int32)
refined_keypoints, refined_scores = cnma.refine_keypoints(
regressed_keypoints, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=None,
unmatched_keypoint_score=unmatched_keypoint_score,
box_scale=1.2, candidate_search_scale=0.3,
candidate_ranking_mode=candidate_ranking_mode)
return refined_keypoints, refined_scores
refined_keypoints, refined_scores = self.execute(graph_fn, [])
if candidate_ranking_mode == 'min_distance':
expected_refined_keypoints = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [4.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 1.5], [0.0, 0.3], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
expected_refined_scores = np.array(
[
# Example 0.
[
[0.8, 0.9, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
unmatched_keypoint_score, 1.0],
],
# Example 1.
[
[0.7, 0.1, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
0.1, unmatched_keypoint_score],
],
], dtype=np.float32)
else:
expected_refined_keypoints = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [4.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 1.5], [0.1, 0.3], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
expected_refined_scores = np.array(
[
# Example 0.
[
[0.8, 0.9, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
unmatched_keypoint_score, 1.0],
],
# Example 1.
[
[0.7, 0.28, unmatched_keypoint_score], # Instance 0.
[unmatched_keypoint_score, # Instance 1.
0.1, unmatched_keypoint_score],
],
], dtype=np.float32)
np.testing.assert_allclose(expected_refined_keypoints, refined_keypoints)
np.testing.assert_allclose(expected_refined_scores, refined_scores)
def test_refine_keypoints_with_bboxes(self):
regressed_keypoints_np = np.array(
[
# Example 0.
[
[[2.0, 2.0], [6.0, 10.0], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [5.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 2.0], [0.0, 0.0], [0.1, 0.1]], # Instance 0.
[[6.0, 2.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
keypoint_candidates_np = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.5], [4.0, 7.0]], # Candidate 0.
[[1.0, 8.0], [0.0, 0.0], [2.0, 2.0]], # Candidate 1.
],
# Example 1.
[
[[6.0, 1.5], [5.0, 5.0], [0.0, 0.0]], # Candidate 0.
[[1.0, 4.0], [0.0, 0.3], [0.0, 0.0]], # Candidate 1.
]
], dtype=np.float32)
keypoint_scores_np = np.array(
[
# Example 0.
[
[0.8, 0.9, 1.0], # Candidate 0.
[0.6, 0.1, 0.9], # Candidate 1.
],
# Example 1.
[
[0.7, 0.4, 0.0], # Candidate 0.
[0.6, 0.1, 0.0], # Candidate 1.
]
], dtype=np.float32)
num_keypoints_candidates_np = np.array(
[
# Example 0.
[2, 2, 2],
# Example 1.
[2, 2, 0],
], dtype=np.int32)
bboxes_np = np.array(
[
# Example 0.
[
[2.0, 2.0, 14.0, 10.0], # Instance 0.
[0.0, 3.0, 5.0, 7.0], # Instance 1.
],
# Example 1.
[
[0.0, 0.0, 6.0, 2.0], # Instance 0.
[5.0, 1.4, 9.0, 5.0], # Instance 1.
],
], dtype=np.float32)
unmatched_keypoint_score = 0.1
def graph_fn():
regressed_keypoints = tf.constant(
regressed_keypoints_np, dtype=tf.float32)
keypoint_candidates = tf.constant(
keypoint_candidates_np, dtype=tf.float32)
keypoint_scores = tf.constant(keypoint_scores_np, dtype=tf.float32)
num_keypoint_candidates = tf.constant(num_keypoints_candidates_np,
dtype=tf.int32)
bboxes = tf.constant(bboxes_np, dtype=tf.float32)
refined_keypoints, refined_scores = cnma.refine_keypoints(
regressed_keypoints, keypoint_candidates, keypoint_scores,
num_keypoint_candidates, bboxes=bboxes,
unmatched_keypoint_score=unmatched_keypoint_score,
box_scale=1.0, candidate_search_scale=0.3)
return refined_keypoints, refined_scores
refined_keypoints, refined_scores = self.execute(graph_fn, [])
expected_refined_keypoints = np.array(
[
# Example 0.
[
[[2.0, 2.5], [6.0, 10.0], [14.0, 7.0]], # Instance 0.
[[0.0, 6.0], [3.0, 3.0], [4.0, 7.0]], # Instance 1.
],
# Example 1.
[
[[6.0, 1.5], [0.0, 0.3], [0.1, 0.1]], # Instance 0.
[[6.0, 1.5], [5.0, 5.0], [9.0, 3.0]], # Instance 1.
],
], dtype=np.float32)
expected_refined_scores = np.array(
[
# Example 0.
[
[0.8, unmatched_keypoint_score, # Instance 0.
unmatched_keypoint_score],
[unmatched_keypoint_score, # Instance 1.
unmatched_keypoint_score, 1.0],
],
# Example 1.
[
[0.7, 0.1, unmatched_keypoint_score], # Instance 0.
[0.7, 0.4, unmatched_keypoint_score], # Instance 1.
],
], dtype=np.float32)
np.testing.assert_allclose(expected_refined_keypoints, refined_keypoints)
np.testing.assert_allclose(expected_refined_scores, refined_scores)
def test_pad_to_full_keypoint_dim(self):
batch_size = 4
num_instances = 8
num_keypoints = 2
keypoint_inds = [1, 3]
num_total_keypoints = 5
kpt_coords_np = np.random.randn(batch_size, num_instances, num_keypoints, 2)
kpt_scores_np = np.random.randn(batch_size, num_instances, num_keypoints)
def graph_fn():
kpt_coords = tf.constant(kpt_coords_np)
kpt_scores = tf.constant(kpt_scores_np)
kpt_coords_padded, kpt_scores_padded = (
cnma._pad_to_full_keypoint_dim(
kpt_coords, kpt_scores, keypoint_inds, num_total_keypoints))
return kpt_coords_padded, kpt_scores_padded
kpt_coords_padded, kpt_scores_padded = self.execute(graph_fn, [])
self.assertAllEqual([batch_size, num_instances, num_total_keypoints, 2],
kpt_coords_padded.shape)
self.assertAllEqual([batch_size, num_instances, num_total_keypoints],
kpt_scores_padded.shape)
for i, kpt_ind in enumerate(keypoint_inds):
np.testing.assert_allclose(kpt_coords_np[:, :, i, :],
kpt_coords_padded[:, :, kpt_ind, :])
np.testing.assert_allclose(kpt_scores_np[:, :, i],
kpt_scores_padded[:, :, kpt_ind])
def test_pad_to_full_instance_dim(self):
batch_size = 4
max_instances = 8
num_keypoints = 6
num_instances = 2
instance_inds = [1, 3]
kpt_coords_np = np.random.randn(batch_size, num_instances, num_keypoints, 2)
kpt_scores_np = np.random.randn(batch_size, num_instances, num_keypoints)
def graph_fn():
kpt_coords = tf.constant(kpt_coords_np)
kpt_scores = tf.constant(kpt_scores_np)
kpt_coords_padded, kpt_scores_padded = (
cnma._pad_to_full_instance_dim(
kpt_coords, kpt_scores, instance_inds, max_instances))
return kpt_coords_padded, kpt_scores_padded
kpt_coords_padded, kpt_scores_padded = self.execute(graph_fn, [])
self.assertAllEqual([batch_size, max_instances, num_keypoints, 2],
kpt_coords_padded.shape)
self.assertAllEqual([batch_size, max_instances, num_keypoints],
kpt_scores_padded.shape)
for i, inst_ind in enumerate(instance_inds):
np.testing.assert_allclose(kpt_coords_np[:, i, :, :],
kpt_coords_padded[:, inst_ind, :, :])
np.testing.assert_allclose(kpt_scores_np[:, i, :],
kpt_scores_padded[:, inst_ind, :])
# Common parameters for setting up testing examples across tests.
_NUM_CLASSES = 10
_KEYPOINT_INDICES = [0, 1, 2, 3]
_NUM_KEYPOINTS = len(_KEYPOINT_INDICES)
_TASK_NAME = 'human_pose'
def get_fake_center_params():
"""Returns the fake object center parameter namedtuple."""
return cnma.ObjectCenterParams(
classification_loss=losses.WeightedSigmoidClassificationLoss(),
object_center_loss_weight=1.0,
min_box_overlap_iou=1.0,
max_box_predictions=5,
use_labeled_classes=False)
def get_fake_od_params():
"""Returns the fake object detection parameter namedtuple."""
return cnma.ObjectDetectionParams(
localization_loss=losses.L1LocalizationLoss(),
offset_loss_weight=1.0,
scale_loss_weight=0.1)
def get_fake_kp_params():
"""Returns the fake keypoint estimation parameter namedtuple."""
return cnma.KeypointEstimationParams(
task_name=_TASK_NAME,
class_id=1,
keypoint_indices=_KEYPOINT_INDICES,
keypoint_std_dev=[0.00001] * len(_KEYPOINT_INDICES),
classification_loss=losses.WeightedSigmoidClassificationLoss(),
localization_loss=losses.L1LocalizationLoss(),
keypoint_candidate_score_threshold=0.1)
def get_fake_mask_params():
"""Returns the fake mask estimation parameter namedtuple."""
return cnma.MaskParams(
classification_loss=losses.WeightedSoftmaxClassificationLoss(),
task_loss_weight=1.0,
mask_height=4,
mask_width=4)
def build_center_net_meta_arch(build_resnet=False):
"""Builds the CenterNet meta architecture."""
if build_resnet:
feature_extractor = (
center_net_resnet_feature_extractor.CenterNetResnetFeatureExtractor(
'resnet_v2_101'))
else:
feature_extractor = DummyFeatureExtractor(
channel_means=(1.0, 2.0, 3.0),
channel_stds=(10., 20., 30.),
bgr_ordering=False,
num_feature_outputs=2,
stride=4)
image_resizer_fn = functools.partial(
preprocessor.resize_to_range,
min_dimension=128,
max_dimension=128,
pad_to_max_dimension=True)
return cnma.CenterNetMetaArch(
is_training=True,
add_summaries=False,
num_classes=_NUM_CLASSES,
feature_extractor=feature_extractor,
image_resizer_fn=image_resizer_fn,
object_center_params=get_fake_center_params(),
object_detection_params=get_fake_od_params(),
keypoint_params_dict={_TASK_NAME: get_fake_kp_params()},
mask_params=get_fake_mask_params())
def _logit(p):
return np.log(
(p + np.finfo(np.float32).eps) / (1 - p + np.finfo(np.float32).eps))
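# Note: _logit is the inverse of the sigmoid; it converts the probabilities
# used in these tests into logits, with a small epsilon added for numerical
# stability. For example, _logit(0.5) is approximately 0.0 and _logit(0.75)
# is approximately log(3).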
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchLibTest(test_case.TestCase):
"""Test for CenterNet meta architecture related functions."""
def test_get_keypoint_name(self):
self.assertEqual('human_pose/keypoint_offset',
cnma.get_keypoint_name('human_pose', 'keypoint_offset'))
def test_get_num_instances_from_weights(self):
weight1 = tf.constant([0.0, 0.0, 0.0], dtype=tf.float32)
weight2 = tf.constant([0.5, 0.9, 0.0], dtype=tf.float32)
weight3 = tf.constant([0.0, 0.0, 1.0], dtype=tf.float32)
def graph_fn_1():
# Total of three elements with non-zero values.
num_instances = cnma.get_num_instances_from_weights(
[weight1, weight2, weight3])
return num_instances
num_instances = self.execute(graph_fn_1, [])
self.assertAlmostEqual(3, num_instances)
# No non-zero value in the weights. Return minimum value: 1.
def graph_fn_2():
# No non-zero values in the weights, so the minimum of 1 is returned.
num_instances = cnma.get_num_instances_from_weights([weight1, weight1])
return num_instances
num_instances = self.execute(graph_fn_2, [])
self.assertAlmostEqual(1, num_instances)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchTest(test_case.TestCase, parameterized.TestCase):
"""Tests for the CenterNet meta architecture."""
def test_construct_prediction_heads(self):
model = build_center_net_meta_arch()
fake_feature_map = np.zeros((4, 128, 128, 8))
# Check the dictionary contains expected keys and corresponding heads with
# correct dimensions.
# "object center" head:
output = model._prediction_head_dict[cnma.OBJECT_CENTER][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_CLASSES), output.shape)
# "object scale" (height/width) head:
output = model._prediction_head_dict[cnma.BOX_SCALE][-1](fake_feature_map)
self.assertEqual((4, 128, 128, 2), output.shape)
# "object offset" head:
output = model._prediction_head_dict[cnma.BOX_OFFSET][-1](fake_feature_map)
self.assertEqual((4, 128, 128, 2), output.shape)
# "keypoint offset" head:
output = model._prediction_head_dict[
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET)][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, 2), output.shape)
# "keypoint heatmap" head:
output = model._prediction_head_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_HEATMAP)][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_KEYPOINTS), output.shape)
# "keypoint regression" head:
output = model._prediction_head_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_REGRESSION)][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, 2 * _NUM_KEYPOINTS), output.shape)
# "mask" head:
output = model._prediction_head_dict[cnma.SEGMENTATION_HEATMAP][-1](
fake_feature_map)
self.assertEqual((4, 128, 128, _NUM_CLASSES), output.shape)
def test_initialize_target_assigners(self):
model = build_center_net_meta_arch()
assigner_dict = model._initialize_target_assigners(
stride=2,
min_box_overlap_iou=0.7)
# Check whether the corresponding target assigner class is initialized.
# object center target assigner:
self.assertIsInstance(assigner_dict[cnma.OBJECT_CENTER],
cn_assigner.CenterNetCenterHeatmapTargetAssigner)
# object detection target assigner:
self.assertIsInstance(assigner_dict[cnma.DETECTION_TASK],
cn_assigner.CenterNetBoxTargetAssigner)
# keypoint estimation target assigner:
self.assertIsInstance(assigner_dict[_TASK_NAME],
cn_assigner.CenterNetKeypointTargetAssigner)
# mask estimation target assigner:
self.assertIsInstance(assigner_dict[cnma.SEGMENTATION_TASK],
cn_assigner.CenterNetMaskTargetAssigner)
def test_predict(self):
"""Test the predict function."""
model = build_center_net_meta_arch()
def graph_fn():
prediction_dict = model.predict(tf.zeros([2, 128, 128, 3]), None)
return prediction_dict
prediction_dict = self.execute(graph_fn, [])
self.assertEqual(prediction_dict['preprocessed_inputs'].shape,
(2, 128, 128, 3))
self.assertEqual(prediction_dict[cnma.OBJECT_CENTER][0].shape,
(2, 32, 32, _NUM_CLASSES))
self.assertEqual(prediction_dict[cnma.BOX_SCALE][0].shape,
(2, 32, 32, 2))
self.assertEqual(prediction_dict[cnma.BOX_OFFSET][0].shape,
(2, 32, 32, 2))
self.assertEqual(prediction_dict[cnma.SEGMENTATION_HEATMAP][0].shape,
(2, 32, 32, _NUM_CLASSES))
def test_loss(self):
"""Test the loss function."""
groundtruth_dict = get_fake_groundtruth_dict(16, 32, 4)
model = build_center_net_meta_arch()
model.provide_groundtruth(
groundtruth_boxes_list=groundtruth_dict[fields.BoxListFields.boxes],
groundtruth_weights_list=groundtruth_dict[fields.BoxListFields.weights],
groundtruth_classes_list=groundtruth_dict[fields.BoxListFields.classes],
groundtruth_keypoints_list=groundtruth_dict[
fields.BoxListFields.keypoints],
groundtruth_masks_list=groundtruth_dict[
fields.BoxListFields.masks])
prediction_dict = get_fake_prediction_dict(
input_height=16, input_width=32, stride=4)
def graph_fn():
loss_dict = model.loss(prediction_dict,
tf.constant([[16, 24, 3], [16, 24, 3]]))
return loss_dict
loss_dict = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX, cnma.OBJECT_CENTER)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX, cnma.BOX_SCALE)])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX, cnma.BOX_OFFSET)])
self.assertGreater(
0.01,
loss_dict['%s/%s' %
(cnma.LOSS_KEY_PREFIX,
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP))])
self.assertGreater(
0.01,
loss_dict['%s/%s' %
(cnma.LOSS_KEY_PREFIX,
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET))])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_REGRESSION))])
self.assertGreater(
0.01, loss_dict['%s/%s' % (cnma.LOSS_KEY_PREFIX,
cnma.SEGMENTATION_HEATMAP)])
@parameterized.parameters(
{'target_class_id': 1},
{'target_class_id': 2},
)
def test_postprocess(self, target_class_id):
"""Test the postprocess function."""
model = build_center_net_meta_arch()
max_detection = model._center_params.max_box_predictions
num_keypoints = len(model._kp_params_dict[_TASK_NAME].keypoint_indices)
class_center = np.zeros((1, 32, 32, 10), dtype=np.float32)
height_width = np.zeros((1, 32, 32, 2), dtype=np.float32)
offset = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_heatmaps = np.zeros((1, 32, 32, num_keypoints), dtype=np.float32)
keypoint_offsets = np.zeros((1, 32, 32, 2), dtype=np.float32)
keypoint_regression = np.random.randn(1, 32, 32, num_keypoints * 2)
class_probs = np.zeros(10)
class_probs[target_class_id] = _logit(0.75)
class_center[0, 16, 16] = class_probs
height_width[0, 16, 16] = [5, 10]
offset[0, 16, 16] = [.25, .5]
keypoint_regression[0, 16, 16] = [
-1., -1.,
-1., 1.,
1., -1.,
1., 1.]
keypoint_heatmaps[0, 14, 14, 0] = _logit(0.9)
keypoint_heatmaps[0, 14, 18, 1] = _logit(0.9)
keypoint_heatmaps[0, 18, 14, 2] = _logit(0.9)
keypoint_heatmaps[0, 18, 18, 3] = _logit(0.05) # Note the low score.
segmentation_heatmap = np.zeros((1, 32, 32, 10), dtype=np.float32)
segmentation_heatmap[:, 14:18, 14:18, target_class_id] = 1.0
segmentation_heatmap = _logit(segmentation_heatmap)
class_center = tf.constant(class_center)
height_width = tf.constant(height_width)
offset = tf.constant(offset)
keypoint_heatmaps = tf.constant(keypoint_heatmaps, dtype=tf.float32)
keypoint_offsets = tf.constant(keypoint_offsets, dtype=tf.float32)
keypoint_regression = tf.constant(keypoint_regression, dtype=tf.float32)
segmentation_heatmap = tf.constant(segmentation_heatmap, dtype=tf.float32)
prediction_dict = {
cnma.OBJECT_CENTER: [class_center],
cnma.BOX_SCALE: [height_width],
cnma.BOX_OFFSET: [offset],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP):
[keypoint_heatmaps],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET):
[keypoint_offsets],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION):
[keypoint_regression],
cnma.SEGMENTATION_HEATMAP: [segmentation_heatmap],
}
def graph_fn():
detections = model.postprocess(prediction_dict,
tf.constant([[128, 128, 3]]))
return detections
detections = self.execute_cpu(graph_fn, [])
self.assertAllClose(detections['detection_boxes'][0, 0],
np.array([55, 46, 75, 86]) / 128.0)
self.assertAllClose(detections['detection_scores'][0],
[.75, .5, .5, .5, .5])
self.assertEqual(detections['detection_classes'][0, 0], target_class_id)
self.assertEqual(detections['num_detections'], [5])
self.assertAllEqual([1, max_detection, num_keypoints, 2],
detections['detection_keypoints'].shape)
self.assertAllEqual([1, max_detection, num_keypoints],
detections['detection_keypoint_scores'].shape)
self.assertAllEqual([1, max_detection, 4, 4],
detections['detection_masks'].shape)
# There should be some section of the first mask (corresponding to the only
# detection) with non-zero mask values.
self.assertGreater(np.sum(detections['detection_masks'][0, 0, :, :] > 0), 0)
self.assertAllEqual(
detections['detection_masks'][0, 1:, :, :],
np.zeros_like(detections['detection_masks'][0, 1:, :, :]))
if target_class_id == 1:
expected_kpts_for_obj_0 = np.array(
[[14., 14.], [14., 18.], [18., 14.], [17., 17.]]) / 32.
expected_kpt_scores_for_obj_0 = np.array(
[0.9, 0.9, 0.9, cnma.UNMATCHED_KEYPOINT_SCORE])
np.testing.assert_allclose(detections['detection_keypoints'][0][0],
expected_kpts_for_obj_0, rtol=1e-6)
np.testing.assert_allclose(detections['detection_keypoint_scores'][0][0],
expected_kpt_scores_for_obj_0, rtol=1e-6)
else:
# All keypoint outputs should be zeros.
np.testing.assert_allclose(
detections['detection_keypoints'][0][0],
np.zeros([num_keypoints, 2], np.float32),
rtol=1e-6)
np.testing.assert_allclose(
detections['detection_keypoint_scores'][0][0],
np.zeros([num_keypoints], np.float32),
rtol=1e-6)
def test_get_instance_indices(self):
classes = tf.constant([[0, 1, 2, 0], [2, 1, 2, 2]], dtype=tf.int32)
num_detections = tf.constant([1, 3], dtype=tf.int32)
batch_index = 1
class_id = 2
model = build_center_net_meta_arch()
valid_indices = model._get_instance_indices(
classes, num_detections, batch_index, class_id)
self.assertAllEqual(valid_indices.numpy(), [0, 2])
def get_fake_prediction_dict(input_height, input_width, stride):
"""Prepares the fake prediction dictionary."""
output_height = input_height // stride
output_width = input_width // stride
object_center = np.zeros((2, output_height, output_width, _NUM_CLASSES),
dtype=np.float32)
# Box center:
# y: floor((0.54 + 0.56) / 2 * 4) = 2,
# x: floor((0.54 + 0.56) / 2 * 8) = 4
object_center[0, 2, 4, 1] = 1.0
object_center = _logit(object_center)
# Box size:
# height: (0.56 - 0.54) * 4 = 0.08
# width: (0.56 - 0.54) * 8 = 0.16
object_scale = np.zeros((2, output_height, output_width, 2), dtype=np.float32)
object_scale[0, 2, 4] = 0.08, 0.16
# Box center offset coordinate (0.55, 0.55):
# y-offset: 0.55 * 4 - 2 = 0.2
# x-offset: 0.55 * 8 - 4 = 0.4
object_offset = np.zeros((2, output_height, output_width, 2),
dtype=np.float32)
object_offset[0, 2, 4] = 0.2, 0.4
keypoint_heatmap = np.zeros((2, output_height, output_width, _NUM_KEYPOINTS),
dtype=np.float32)
keypoint_heatmap[0, 2, 4, 1] = 1.0
keypoint_heatmap[0, 2, 4, 3] = 1.0
keypoint_heatmap = _logit(keypoint_heatmap)
keypoint_offset = np.zeros((2, output_height, output_width, 2),
dtype=np.float32)
keypoint_offset[0, 2, 4] = 0.2, 0.4
keypoint_regression = np.zeros(
(2, output_height, output_width, 2 * _NUM_KEYPOINTS), dtype=np.float32)
keypoint_regression[0, 2, 4] = 0.0, 0.0, 0.2, 0.4, 0.0, 0.0, 0.2, 0.4
mask_heatmap = np.zeros((2, output_height, output_width, _NUM_CLASSES),
dtype=np.float32)
mask_heatmap[0, 2, 4, 1] = 1.0
mask_heatmap = _logit(mask_heatmap)
prediction_dict = {
'preprocessed_inputs':
tf.zeros((2, input_height, input_width, 3)),
cnma.OBJECT_CENTER: [
tf.constant(object_center),
tf.constant(object_center)
],
cnma.BOX_SCALE: [
tf.constant(object_scale),
tf.constant(object_scale)
],
cnma.BOX_OFFSET: [
tf.constant(object_offset),
tf.constant(object_offset)
],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_HEATMAP): [
tf.constant(keypoint_heatmap),
tf.constant(keypoint_heatmap)
],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_OFFSET): [
tf.constant(keypoint_offset),
tf.constant(keypoint_offset)
],
cnma.get_keypoint_name(_TASK_NAME, cnma.KEYPOINT_REGRESSION): [
tf.constant(keypoint_regression),
tf.constant(keypoint_regression)
],
cnma.SEGMENTATION_HEATMAP: [
tf.constant(mask_heatmap),
tf.constant(mask_heatmap)
]
}
return prediction_dict
def get_fake_groundtruth_dict(input_height, input_width, stride):
"""Prepares the fake groundtruth dictionary."""
# A small box with center at (0.55, 0.55).
boxes = [
tf.constant([[0.54, 0.54, 0.56, 0.56]]),
tf.constant([[0.0, 0.0, 0.5, 0.5]]),
]
classes = [
tf.one_hot([1], depth=_NUM_CLASSES),
tf.one_hot([0], depth=_NUM_CLASSES),
]
weights = [
tf.constant([1.]),
tf.constant([0.]),
]
keypoints = [
tf.tile(
tf.expand_dims(
tf.constant([[float('nan'), 0.55,
float('nan'), 0.55, 0.55, 0.0]]),
axis=2),
multiples=[1, 1, 2]),
tf.tile(
tf.expand_dims(
tf.constant([[float('nan'), 0.55,
float('nan'), 0.55, 0.55, 0.0]]),
axis=2),
multiples=[1, 1, 2]),
]
labeled_classes = [
tf.one_hot([1], depth=_NUM_CLASSES) + tf.one_hot([2], depth=_NUM_CLASSES),
tf.one_hot([0], depth=_NUM_CLASSES) + tf.one_hot([1], depth=_NUM_CLASSES),
]
mask = np.zeros((1, input_height, input_width), dtype=np.float32)
mask[0, 8:8+stride, 16:16+stride] = 1
masks = [
tf.constant(mask),
tf.zeros_like(mask),
]
groundtruth_dict = {
fields.BoxListFields.boxes: boxes,
fields.BoxListFields.weights: weights,
fields.BoxListFields.classes: classes,
fields.BoxListFields.keypoints: keypoints,
fields.BoxListFields.masks: masks,
fields.InputDataFields.groundtruth_labeled_classes: labeled_classes,
}
return groundtruth_dict
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaComputeLossTest(test_case.TestCase):
"""Test for CenterNet loss compuation related functions."""
def setUp(self):
self.model = build_center_net_meta_arch()
self.classification_loss_fn = self.model._center_params.classification_loss
self.localization_loss_fn = self.model._od_params.localization_loss
self.true_image_shapes = tf.constant([[16, 24, 3], [16, 24, 3]])
self.input_height = 16
self.input_width = 32
self.stride = 4
self.per_pixel_weights = self.get_per_pixel_weights(self.true_image_shapes,
self.input_height,
self.input_width,
self.stride)
self.prediction_dict = get_fake_prediction_dict(self.input_height,
self.input_width,
self.stride)
self.model._groundtruth_lists = get_fake_groundtruth_dict(
self.input_height, self.input_width, self.stride)
super(CenterNetMetaComputeLossTest, self).setUp()
def get_per_pixel_weights(self, true_image_shapes, input_height, input_width,
stride):
output_height, output_width = (input_height // stride,
input_width // stride)
# TODO(vighneshb) Explore whether using floor here is safe.
output_true_image_shapes = tf.ceil(tf.to_float(true_image_shapes) / stride)
per_pixel_weights = cnma.get_valid_anchor_weights_in_flattened_image(
output_true_image_shapes, output_height, output_width)
per_pixel_weights = tf.expand_dims(per_pixel_weights, 2)
return per_pixel_weights
def test_compute_object_center_loss(self):
def graph_fn():
loss = self.model._compute_object_center_loss(
object_center_predictions=self.prediction_dict[cnma.OBJECT_CENTER],
input_height=self.input_height,
input_width=self.input_width,
per_pixel_weights=self.per_pixel_weights)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
default_value = self.model._center_params.use_only_known_classes
self.model._center_params = (
self.model._center_params._replace(use_only_known_classes=True))
loss = self.model._compute_object_center_loss(
object_center_predictions=self.prediction_dict[cnma.OBJECT_CENTER],
input_height=self.input_height,
input_width=self.input_width,
per_pixel_weights=self.per_pixel_weights)
self.model._center_params = (
self.model._center_params._replace(
use_only_known_classes=default_value))
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
def test_compute_box_scale_and_offset_loss(self):
def graph_fn():
scale_loss, offset_loss = self.model._compute_box_scale_and_offset_loss(
scale_predictions=self.prediction_dict[cnma.BOX_SCALE],
offset_predictions=self.prediction_dict[cnma.BOX_OFFSET],
input_height=self.input_height,
input_width=self.input_width)
return scale_loss, offset_loss
scale_loss, offset_loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, scale_loss)
self.assertGreater(0.01, offset_loss)
def test_compute_kp_heatmap_loss(self):
def graph_fn():
loss = self.model._compute_kp_heatmap_loss(
input_height=self.input_height,
input_width=self.input_width,
task_name=_TASK_NAME,
heatmap_predictions=self.prediction_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_HEATMAP)],
classification_loss_fn=self.classification_loss_fn,
per_pixel_weights=self.per_pixel_weights)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
def test_compute_kp_offset_loss(self):
def graph_fn():
loss = self.model._compute_kp_offset_loss(
input_height=self.input_height,
input_width=self.input_width,
task_name=_TASK_NAME,
offset_predictions=self.prediction_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_OFFSET)],
localization_loss_fn=self.localization_loss_fn)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
def test_compute_kp_regression_loss(self):
def graph_fn():
loss = self.model._compute_kp_regression_loss(
input_height=self.input_height,
input_width=self.input_width,
task_name=_TASK_NAME,
regression_predictions=self.prediction_dict[cnma.get_keypoint_name(
_TASK_NAME, cnma.KEYPOINT_REGRESSION,)],
localization_loss_fn=self.localization_loss_fn)
return loss
loss = self.execute(graph_fn, [])
# The prediction and groundtruth are curated to produce very low loss.
self.assertGreater(0.01, loss)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetMetaArchRestoreTest(test_case.TestCase):
def test_restore_map_resnet(self):
"""Test restore map for a resnet backbone."""
model = build_center_net_meta_arch(build_resnet=True)
restore_map = model.restore_map('classification')
self.assertIsInstance(restore_map['feature_extractor'], tf.keras.Model)
class DummyFeatureExtractor(cnma.CenterNetFeatureExtractor):
def __init__(self,
channel_means,
channel_stds,
bgr_ordering,
num_feature_outputs,
stride):
self._num_feature_outputs = num_feature_outputs
self._stride = stride
super(DummyFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
def predict(self):
pass
def loss(self):
pass
def postprocess(self):
pass
def restore_map(self):
pass
def call(self, inputs):
batch_size, input_height, input_width, _ = inputs.shape
fake_output = tf.ones([
batch_size, input_height // self._stride, input_width // self._stride,
64
], dtype=tf.float32)
return [fake_output] * self._num_feature_outputs
@property
def out_stride(self):
return self._stride
@property
def num_feature_outputs(self):
return self._num_feature_outputs
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetFeatureExtractorTest(test_case.TestCase):
"""Test the base feature extractor class."""
def test_preprocess(self):
feature_extractor = DummyFeatureExtractor(
channel_means=(1.0, 2.0, 3.0),
channel_stds=(10., 20., 30.), bgr_ordering=False,
num_feature_outputs=2, stride=4)
img = np.zeros((2, 32, 32, 3))
img[:, :, :] = 11, 22, 33
def graph_fn():
output = feature_extractor.preprocess(img)
return output
output = self.execute(graph_fn, [])
self.assertAlmostEqual(output.sum(), 2 * 32 * 32 * 3)
def test_bgr_ordering(self):
feature_extractor = DummyFeatureExtractor(
channel_means=(0.0, 0.0, 0.0),
channel_stds=(1., 1., 1.), bgr_ordering=True,
num_feature_outputs=2, stride=4)
img = np.zeros((2, 32, 32, 3), dtype=np.float32)
img[:, :, :] = 1, 2, 3
def graph_fn():
output = feature_extractor.preprocess(img)
return output
output = self.execute(graph_fn, [])
self.assertAllClose(output[..., 2], 1 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 1], 2 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 0], 3 * np.ones((2, 32, 32)))
def test_default_ordering(self):
feature_extractor = DummyFeatureExtractor(
channel_means=(0.0, 0.0, 0.0),
channel_stds=(1., 1., 1.), bgr_ordering=False,
num_feature_outputs=2, stride=4)
img = np.zeros((2, 32, 32, 3), dtype=np.float32)
img[:, :, :] = 1, 2, 3
def graph_fn():
output = feature_extractor.preprocess(img)
return output
output = self.execute(graph_fn, [])
self.assertAllClose(output[..., 0], 1 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 1], 2 * np.ones((2, 32, 32)))
self.assertAllClose(output[..., 2], 3 * np.ones((2, 32, 32)))
if __name__ == '__main__':
tf.enable_v2_behavior()
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library functions for ContextRCNN."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v1 as tf
import tf_slim as slim
# The negative value used in padding the invalid weights.
_NEGATIVE_PADDING_VALUE = -100000
def filter_weight_value(weights, values, valid_mask):
"""Filters weights and values based on valid_mask.
_NEGATIVE_PADDING_VALUE is added to the invalid elements of weights so that
they do not contribute to the softmax. The invalid elements of values are set
to 0.
Args:
weights: A float Tensor of shape [batch_size, input_size, context_size].
values: A float Tensor of shape [batch_size, context_size,
projected_dimension].
valid_mask: A boolean Tensor of shape [batch_size, context_size]. True means
valid and False means invalid.
Returns:
weights: A float Tensor of shape [batch_size, input_size, context_size].
values: A float Tensor of shape [batch_size, context_size,
projected_dimension].
Raises:
ValueError: If the shapes of the inputs do not match.
"""
w_batch_size, _, w_context_size = weights.shape
v_batch_size, v_context_size, _ = values.shape
m_batch_size, m_context_size = valid_mask.shape
if w_batch_size != v_batch_size or v_batch_size != m_batch_size:
raise ValueError("Please make sure the first dimension of the input"
" tensors are the same.")
if w_context_size != v_context_size:
raise ValueError("Please make sure the third dimension of weights matches"
" the second dimension of values.")
if w_context_size != m_context_size:
raise ValueError("Please make sure the third dimension of the weights"
" matches the second dimension of the valid_mask.")
valid_mask = valid_mask[..., tf.newaxis]
# Force the invalid weights to be very negative so they won't contribute to
# the softmax.
weights += tf.transpose(
tf.cast(tf.math.logical_not(valid_mask), weights.dtype) *
_NEGATIVE_PADDING_VALUE,
perm=[0, 2, 1])
# Force the invalid values to be 0.
values *= tf.cast(valid_mask, values.dtype)
return weights, values
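# A minimal sketch of the effect, assuming a batch of 1 with input_size=1 and
# context_size=2 where only the first context entry is valid:
#   weights = [[[0.2, 0.3]]], values = [[[1., 2.], [3., 4.]]],
#   valid_mask = [[True, False]]
#   filter_weight_value(...) returns weights ~= [[[0.2, -99999.7]]] (the
#   invalid column is pushed to a large negative value so its softmax weight
#   is ~0) and values = [[[1., 2.], [0., 0.]]] (the invalid row is zeroed).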
def compute_valid_mask(num_valid_elements, num_elements):
"""Computes mask of valid entries within padded context feature.
Args:
num_valid_elements: An int32 Tensor of shape [batch_size].
num_elements: An int32 Tensor.
Returns:
A boolean Tensor of the shape [batch_size, num_elements]. True means
valid and False means invalid.
"""
batch_size = num_valid_elements.shape[0]
element_idxs = tf.range(num_elements, dtype=tf.int32)
batch_element_idxs = tf.tile(element_idxs[tf.newaxis, ...], [batch_size, 1])
num_valid_elements = num_valid_elements[..., tf.newaxis]
valid_mask = tf.less(batch_element_idxs, num_valid_elements)
return valid_mask
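# A minimal usage sketch: with num_valid_elements = [1, 3] and num_elements = 4,
# compute_valid_mask returns
#   [[True, False, False, False],
#    [True, True,  True,  False]].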
def project_features(features, projection_dimension, is_training, normalize):
"""Projects features to another feature space.
Args:
features: A float Tensor of shape [batch_size, features_size,
num_features].
projection_dimension: An int32 Tensor.
is_training: A boolean Tensor (affecting batch normalization).
normalize: A boolean Tensor. If true, the output features will be l2
normalized on the last dimension.
Returns:
A float Tensor of shape [batch, features_size, projection_dimension].
"""
# TODO(guanhangwu) Figure out a better way of specifying the batch norm
# params.
batch_norm_params = {
"is_training": is_training,
"decay": 0.97,
"epsilon": 0.001,
"center": True,
"scale": True
}
batch_size, _, num_features = features.shape
features = tf.reshape(features, [-1, num_features])
projected_features = slim.fully_connected(
features,
num_outputs=projection_dimension,
activation_fn=tf.nn.relu6,
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params)
projected_features = tf.reshape(projected_features,
[batch_size, -1, projection_dimension])
if normalize:
projected_features = tf.math.l2_normalize(projected_features, axis=-1)
return projected_features
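# Shape flow of project_features: [batch_size, features_size, num_features] is
# reshaped to [batch_size * features_size, num_features], passed through a
# fully connected layer (ReLU6 plus batch norm) with projection_dimension
# units, and reshaped back to [batch_size, features_size,
# projection_dimension], optionally followed by L2 normalization over the
# last dimension.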
def attention_block(input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask,
is_training):
"""Generic attention block.
Args:
input_features: A float Tensor of shape [batch_size, input_size,
num_input_features].
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
bottleneck_dimension: An int32 Tensor representing the bottleneck dimension
for intermediate projections.
output_dimension: An int32 Tensor representing the last dimension of the
output feature.
attention_temperature: A float Tensor. It controls the temperature of the
softmax for weights calculation. The formula for calculation as follows:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
valid_mask: A boolean Tensor of shape [batch_size, context_size].
is_training: A boolean Tensor (affecting batch normalization).
Returns:
A float Tensor of shape [batch_size, input_size, output_dimension].
"""
with tf.variable_scope("AttentionBlock"):
queries = project_features(
input_features, bottleneck_dimension, is_training, normalize=True)
keys = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
values = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
weights = tf.matmul(queries, keys, transpose_b=True)
weights, values = filter_weight_value(weights, values, valid_mask)
weights = tf.nn.softmax(weights / attention_temperature)
features = tf.matmul(weights, values)
output_features = project_features(
features, output_dimension, is_training, normalize=False)
return output_features
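# Illustrative sketch, not part of the original library: a pure-numpy look at
# the attention_temperature term used above. Dividing the logits by a larger
# temperature flattens the softmax weights, while a smaller temperature
# sharpens them. The logit values below are hypothetical.
def _temperature_softmax_example(temperature=1.0):
  """Returns softmax(logits / temperature) for a toy logit vector."""
  import numpy as np  # Local import; only needed for this illustration.
  logits = np.array([2.0, 1.0, 0.1])
  scaled = logits / temperature
  exps = np.exp(scaled - scaled.max())  # Subtract the max for stability.
  return exps / exps.sum()
# _temperature_softmax_example(1.0)  ~ [0.66, 0.24, 0.10] (peaked)
# _temperature_softmax_example(10.0) ~ [0.37, 0.33, 0.30] (nearly uniform)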
def compute_box_context_attention(box_features, context_features,
valid_context_size, bottleneck_dimension,
attention_temperature, is_training):
"""Computes the attention feature from the context given a batch of box.
Args:
box_features: A float Tensor of shape [batch_size, max_num_proposals,
height, width, channels]. It is pooled features from first stage
proposals.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
    valid_context_size: An int32 Tensor of shape [batch_size].
    bottleneck_dimension: An int32 Tensor representing the bottleneck dimension
      for intermediate projections.
    attention_temperature: A float Tensor. It controls the temperature of the
      softmax for the weights calculation. The formula is:
      weights = exp(weights / temperature) / sum(exp(weights / temperature))
is_training: A boolean Tensor (affecting batch normalization).
Returns:
A float Tensor of shape [batch_size, max_num_proposals, 1, 1, channels].
"""
_, context_size, _ = context_features.shape
valid_mask = compute_valid_mask(valid_context_size, context_size)
channels = box_features.shape[-1]
# Average pools over height and width dimension so that the shape of
# box_features becomes [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])
output_features = attention_block(box_features, context_features,
bottleneck_dimension, channels.value,
attention_temperature, valid_mask,
is_training)
# Expands the dimension back to match with the original feature map.
output_features = output_features[:, :, tf.newaxis, tf.newaxis, :]
return output_features
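# Illustrative shape walk-through, not part of the original library, of
# compute_box_context_attention with hypothetical sizes:
#   box_features     [2, 8, 7, 7, 64]  --mean pool over H, W-->  [2, 8, 64]
#   context_features [2, 20, 10] with valid_context_size [2]
#   attention_block(..., output_dimension=channels=64)      -->  [2, 8, 64]
#   expand dims                                             -->  [2, 8, 1, 1, 64]
# so the result can be broadcast back onto the per-proposal feature map.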
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for context_rcnn_lib."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from absl.testing import parameterized
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.utils import test_case
from object_detection.utils import tf_version
_NEGATIVE_PADDING_VALUE = -100000
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
tf.test.TestCase):
"""Tests for the functions in context_rcnn_lib."""
def test_compute_valid_mask(self):
num_elements = tf.constant(3, tf.int32)
    num_valid_elements = tf.constant((1, 2), tf.int32)
    valid_mask = context_rcnn_lib.compute_valid_mask(num_valid_elements,
                                                     num_elements)
expected_valid_mask = tf.constant([[1, 0, 0], [1, 1, 0]], tf.float32)
self.assertAllEqual(valid_mask, expected_valid_mask)
def test_filter_weight_value(self):
weights = tf.ones((2, 3, 2), tf.float32) * 4
values = tf.ones((2, 2, 4), tf.float32)
valid_mask = tf.constant([[True, True], [True, False]], tf.bool)
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant([[[4, 4], [4, 4], [4, 4]],
[[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[1, 1, 1, 1], [0, 0, 0, 0]]])
self.assertAllEqual(filtered_weights, expected_weights)
self.assertAllEqual(filtered_values, expected_values)
# Changes the valid_mask so the results will be different.
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant(
[[[4, 4], [4, 4], [4, 4]],
[[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[0, 0, 0, 0], [0, 0, 0, 0]]])
self.assertAllEqual(filtered_weights, expected_weights)
self.assertAllEqual(filtered_values, expected_values)
@parameterized.parameters((2, True, True), (2, False, True),
(10, True, False), (10, False, False))
def test_project_features(self, projection_dimension, is_training, normalize):
features = tf.ones([2, 3, 4], tf.float32)
projected_features = context_rcnn_lib.project_features(
features,
projection_dimension,
is_training=is_training,
normalize=normalize)
# Makes sure the shape is correct.
self.assertAllEqual(projected_features.shape, [2, 3, projection_dimension])
@parameterized.parameters(
(2, 10, 1),
(3, 10, 2),
(4, 20, 3),
(5, 20, 4),
(7, 20, 5),
)
def test_attention_block(self, bottleneck_dimension, output_dimension,
attention_temperature):
input_features = tf.ones([2, 3, 4], tf.float32)
context_features = tf.ones([2, 2, 3], tf.float32)
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
is_training = False
output_features = context_rcnn_lib.attention_block(
input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask, is_training)
# Makes sure the shape is correct.
self.assertAllEqual(output_features.shape, [2, 3, output_dimension])
@parameterized.parameters(True, False)
def test_compute_box_context_attention(self, is_training):
box_features = tf.ones([2, 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib.compute_box_context_attention(
box_features, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Context R-CNN meta-architecture definition.
This adds the ability to attend to contextual features within the Faster R-CNN
object detection framework to improve object detection performance.
See https://arxiv.org/abs/1912.03538 for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.meta_architectures import faster_rcnn_meta_arch
class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
"""Context R-CNN Meta-architecture definition."""
def __init__(self,
is_training,
num_classes,
image_resizer_fn,
feature_extractor,
number_of_stages,
first_stage_anchor_generator,
first_stage_target_assigner,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope_fn,
first_stage_box_predictor_kernel_size,
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
crop_and_resize_fn,
initial_crop_size,
maxpool_kernel_size,
maxpool_stride,
second_stage_target_assigner,
second_stage_mask_rcnn_box_predictor,
second_stage_batch_size,
second_stage_sampler,
second_stage_non_max_suppression_fn,
second_stage_score_conversion_fn,
second_stage_localization_loss_weight,
second_stage_classification_loss_weight,
second_stage_classification_loss,
second_stage_mask_prediction_loss_weight=1.0,
hard_example_miner=None,
parallel_iterations=16,
add_summaries=True,
clip_anchors_to_image=False,
use_static_shapes=False,
resize_masks=True,
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False,
attention_bottleneck_dimension=None,
attention_temperature=None):
"""ContextRCNNMetaArch Constructor.
Args:
is_training: A boolean indicating whether the training version of the
computation graph should be constructed.
num_classes: Number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
image_resizer_fn: A callable for image resizing. This callable
takes a rank-3 image tensor of shape [height, width, channels]
(corresponding to a single image), an optional rank-3 instance mask
tensor of shape [num_masks, height, width] and returns a resized rank-3
image tensor, a resized mask tensor if one was provided in the input. In
addition this callable must also return a 1-D tensor of the form
[height, width, channels] containing the size of the true image, as the
image resizer can perform zero padding. See protos/image_resizer.proto.
feature_extractor: A FasterRCNNFeatureExtractor object.
      number_of_stages: An integer taking values in {1, 2, 3}. If
1, the function will construct only the Region Proposal Network (RPN)
part of the model. If 2, the function will perform box refinement and
other auxiliary predictions all in the second stage. If 3, it will
extract features from refined boxes and perform the auxiliary
predictions on the non-maximum suppressed refined boxes.
If is_training is true and the value of number_of_stages is 3, it is
reduced to 2 since all the model heads are trained in parallel in second
stage during training.
first_stage_anchor_generator: An anchor_generator.AnchorGenerator object
(note that currently we only support
grid_anchor_generator.GridAnchorGenerator objects)
first_stage_target_assigner: Target assigner to use for first stage of
Faster R-CNN (RPN).
first_stage_atrous_rate: A single integer indicating the atrous rate for
the single convolution op which is applied to the `rpn_features_to_crop`
tensor to obtain a tensor to be used for box prediction. Some feature
extractors optionally allow for producing feature maps computed at
denser resolutions. The atrous rate is used to compensate for the
denser feature maps by using an effectively larger receptive field.
(This should typically be set to 1).
first_stage_box_predictor_arg_scope_fn: Either a
Keras layer hyperparams object or a function to construct tf-slim
arg_scope for conv2d, separable_conv2d and fully_connected ops. Used
for the RPN box predictor. If it is a keras hyperparams object the
RPN box predictor will be a Keras model. If it is a function to
construct an arg scope it will be a tf-slim box predictor.
first_stage_box_predictor_kernel_size: Kernel size to use for the
convolution op just prior to RPN box predictions.
first_stage_box_predictor_depth: Output depth for the convolution op
just prior to RPN box predictions.
first_stage_minibatch_size: The "batch size" to use for computing the
objectness and location loss of the region proposal network. This
"batch size" refers to the number of anchors selected as contributing
to the loss function for any given image within the image batch and is
only called "batch_size" due to terminology from the Faster R-CNN paper.
first_stage_sampler: Sampler to use for first stage loss (RPN loss).
first_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
callable that takes `boxes`, `scores` and optional `clip_window`(with
all other inputs already set) and returns a dictionary containing
tensors with keys: `detection_boxes`, `detection_scores`,
`detection_classes`, `num_detections`. This is used to perform non max
suppression on the boxes predicted by the Region Proposal Network
(RPN).
See `post_processing.batch_multiclass_non_max_suppression` for the type
and shape of these tensors.
first_stage_max_proposals: Maximum number of boxes to retain after
performing Non-Max Suppression (NMS) on the boxes predicted by the
Region Proposal Network (RPN).
first_stage_localization_loss_weight: A float
first_stage_objectness_loss_weight: A float
crop_and_resize_fn: A differentiable resampler to use for cropping RPN
proposal features.
initial_crop_size: A single integer indicating the output size
(width and height are set to be the same) of the initial bilinear
interpolation based cropping during ROI pooling.
maxpool_kernel_size: A single integer indicating the kernel size of the
max pool op on the cropped feature map during ROI pooling.
maxpool_stride: A single integer indicating the stride of the max pool
op on the cropped feature map during ROI pooling.
second_stage_target_assigner: Target assigner to use for second stage of
Faster R-CNN. If the model is configured with multiple prediction heads,
this target assigner is used to generate targets for all heads (with the
correct `unmatched_class_label`).
second_stage_mask_rcnn_box_predictor: Mask R-CNN box predictor to use for
the second stage.
second_stage_batch_size: The batch size used for computing the
classification and refined location loss of the box classifier. This
"batch size" refers to the number of proposals selected as contributing
to the loss function for any given image within the image batch and is
only called "batch_size" due to terminology from the Faster R-CNN paper.
second_stage_sampler: Sampler to use for second stage loss (box
classifier loss).
second_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
callable that takes `boxes`, `scores`, optional `clip_window` and
optional (kwarg) `mask` inputs (with all other inputs already set)
and returns a dictionary containing tensors with keys:
`detection_boxes`, `detection_scores`, `detection_classes`,
`num_detections`, and (optionally) `detection_masks`. See
`post_processing.batch_multiclass_non_max_suppression` for the type and
shape of these tensors.
second_stage_score_conversion_fn: Callable elementwise nonlinearity
(that takes tensors as inputs and returns tensors). This is usually
used to convert logits to probabilities.
second_stage_localization_loss_weight: A float indicating the scale factor
for second stage localization loss.
second_stage_classification_loss_weight: A float indicating the scale
factor for second stage classification loss.
second_stage_classification_loss: Classification loss used by the second
stage classifier. Either losses.WeightedSigmoidClassificationLoss or
losses.WeightedSoftmaxClassificationLoss.
second_stage_mask_prediction_loss_weight: A float indicating the scale
factor for second stage mask prediction loss. This is applicable only if
second stage box predictor is configured to predict masks.
hard_example_miner: A losses.HardExampleMiner object (can be None).
parallel_iterations: (Optional) The number of iterations allowed to run
in parallel for calls to tf.map_fn.
add_summaries: boolean (default: True) controlling whether summary ops
should be added to tensorflow graph.
clip_anchors_to_image: Normally, anchors generated for a given image size
are pruned during training if they lie outside the image window. This
option clips the anchors to be within the image instead of pruning.
use_static_shapes: If True, uses implementation of ops with static shape
guarantees.
      resize_masks: Indicates whether the masks present in the groundtruth
        should be resized in the model with `image_resizer_fn`.
freeze_batchnorm: Whether to freeze batch norm parameters in the first
stage box predictor during training or not. When training with a small
batch size (e.g. 1), it is desirable to freeze batch norm update and
use pretrained batch norm params.
return_raw_detections_during_predict: Whether to return raw detection
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
      output_final_box_features: Whether to output final box features. If true,
        it crops the feature map based on the final box predictions and returns
        it in the dict as detection_features.
attention_bottleneck_dimension: A single integer. The bottleneck feature
dimension of the attention block.
attention_temperature: A single float. The attention temperature.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
training time.
ValueError: If first_stage_anchor_generator is not of type
grid_anchor_generator.GridAnchorGenerator.
"""
super(ContextRCNNMetaArch, self).__init__(
is_training,
num_classes,
image_resizer_fn,
feature_extractor,
number_of_stages,
first_stage_anchor_generator,
first_stage_target_assigner,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope_fn,
first_stage_box_predictor_kernel_size,
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
crop_and_resize_fn,
initial_crop_size,
maxpool_kernel_size,
maxpool_stride,
second_stage_target_assigner,
second_stage_mask_rcnn_box_predictor,
second_stage_batch_size,
second_stage_sampler,
second_stage_non_max_suppression_fn,
second_stage_score_conversion_fn,
second_stage_localization_loss_weight,
second_stage_classification_loss_weight,
second_stage_classification_loss,
second_stage_mask_prediction_loss_weight=(
second_stage_mask_prediction_loss_weight),
hard_example_miner=hard_example_miner,
parallel_iterations=parallel_iterations,
add_summaries=add_summaries,
clip_anchors_to_image=clip_anchors_to_image,
use_static_shapes=use_static_shapes,
resize_masks=resize_masks,
freeze_batchnorm=freeze_batchnorm,
return_raw_detections_during_predict=(
return_raw_detections_during_predict),
output_final_box_features=output_final_box_features)
self._context_feature_extract_fn = functools.partial(
context_rcnn_lib.compute_box_context_attention,
bottleneck_dimension=attention_bottleneck_dimension,
attention_temperature=attention_temperature,
is_training=is_training)
@staticmethod
def get_side_inputs(features):
"""Overrides the get_side_inputs function in the base class.
This function returns context_features and valid_context_size, which will be
used in the _compute_second_stage_input_feature_maps function.
Args:
features: A dictionary of tensors.
Returns:
      A dictionary of tensors containing context_features and
      valid_context_size.
Raises:
ValueError: If context_features or valid_context_size is not in the
features.
"""
if (fields.InputDataFields.context_features not in features or
fields.InputDataFields.valid_context_size not in features):
raise ValueError(
"Please make sure context_features and valid_context_size are in the "
"features")
return {
fields.InputDataFields.context_features:
features[fields.InputDataFields.context_features],
fields.InputDataFields.valid_context_size:
features[fields.InputDataFields.valid_context_size]
}
def _compute_second_stage_input_feature_maps(self, features_to_crop,
proposal_boxes_normalized,
context_features,
valid_context_size):
"""Crops to a set of proposals from the feature map for a batch of images.
This function overrides the one in the FasterRCNNMetaArch. Aside from
cropping and resizing the feature maps, which is done in the parent class,
it adds context attention features to the box features.
Args:
features_to_crop: A float32 Tensor with shape [batch_size, height, width,
depth]
proposal_boxes_normalized: A float32 Tensor with shape [batch_size,
num_proposals, box_code_size] containing proposal boxes in normalized
coordinates.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
      valid_context_size: An int32 Tensor of shape [batch_size].
Returns:
      A float32 Tensor with shape [K, new_height, new_width, depth], where
      K = batch_size * num_proposals.
"""
box_features = self._crop_and_resize_fn(
features_to_crop, proposal_boxes_normalized,
[self._initial_crop_size, self._initial_crop_size])
attention_features = self._context_feature_extract_fn(
box_features=box_features,
context_features=context_features,
valid_context_size=valid_context_size)
    # Adds the attention features to the box features.
box_features += attention_features
flattened_feature_maps = self._flatten_first_two_dimensions(box_features)
return self._maxpool_layer(flattened_feature_maps)
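# Illustrative sketch, not part of the original class: the attention features
# returned above have shape [batch_size, max_num_proposals, 1, 1, channels],
# so the `+=` relies on broadcasting against the cropped box features of shape
# [batch_size, max_num_proposals, crop_size, crop_size, channels]. A toy numpy
# check of that broadcast (all sizes are hypothetical):
def _attention_broadcast_example():
  """Shows the broadcast add of attention features onto box features."""
  import numpy as np  # Local import; only needed for this illustration.
  box_features = np.ones((2, 3, 4, 4, 8), dtype=np.float32)
  attention_features = np.full((2, 3, 1, 1, 8), 0.5, dtype=np.float32)
  combined = box_features + attention_features  # Broadcasts over crop H and W.
  assert combined.shape == (2, 3, 4, 4, 8)
  return combined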
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for google3.third_party.tensorflow_models.object_detection.meta_architectures.context_meta_arch."""
"""Tests for object_detection.meta_architectures.context_meta_arch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import unittest
from absl.testing import parameterized
import mock
import tensorflow.compat.v1 as tf
......@@ -109,6 +109,7 @@ class FakeFasterRCNNKerasFeatureExtractor(
])
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ContextRCNNMetaArchTest(test_case.TestCase, parameterized.TestCase):
def _get_model(self, box_predictor, **common_kwargs):
......
......@@ -18,9 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import tensorflow.compat.v1 as tf
from object_detection.core import standard_fields
from object_detection.metrics import calibration_evaluation
from object_detection.utils import tf_version
def _get_categories_list():
......@@ -36,6 +38,7 @@ def _get_categories_list():
}]
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class CalibrationDetectionEvaluationTest(tf.test.TestCase):
def _get_ece(self, ece_op, update_op):
......
......@@ -18,11 +18,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.metrics import calibration_metrics
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class CalibrationLibTest(tf.test.TestCase):
@staticmethod
......
......@@ -24,6 +24,7 @@ import tensorflow.compat.v1 as tf
from object_detection.core import standard_fields
from object_detection.metrics import coco_tools
from object_detection.utils import json_utils
from object_detection.utils import np_mask_ops
from object_detection.utils import object_detection_evaluation
......@@ -1263,3 +1264,535 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
eval_metric_ops[metric_name] = (tf.py_func(
value_func_factory(metric_name), [], np.float32), update_op)
return eval_metric_ops
class CocoPanopticSegmentationEvaluator(
object_detection_evaluation.DetectionEvaluator):
"""Class to evaluate PQ (panoptic quality) metric on COCO dataset.
More details about this metric: https://arxiv.org/pdf/1801.00868.pdf.
"""
def __init__(self,
categories,
include_metrics_per_category=False,
iou_threshold=0.5,
ioa_threshold=0.5):
"""Constructor.
Args:
categories: A list of dicts, each of which has the following keys -
'id': (required) an integer id uniquely identifying this category.
'name': (required) string representing category name e.g., 'cat', 'dog'.
include_metrics_per_category: If True, include metrics for each category.
iou_threshold: intersection-over-union threshold for mask matching (with
normal groundtruths).
ioa_threshold: intersection-over-area threshold for mask matching with
"is_crowd" groundtruths.
"""
super(CocoPanopticSegmentationEvaluator, self).__init__(categories)
self._groundtruth_masks = {}
self._groundtruth_class_labels = {}
self._groundtruth_is_crowd = {}
self._predicted_masks = {}
self._predicted_class_labels = {}
self._include_metrics_per_category = include_metrics_per_category
self._iou_threshold = iou_threshold
self._ioa_threshold = ioa_threshold
def clear(self):
"""Clears the state to prepare for a fresh evaluation."""
self._groundtruth_masks.clear()
self._groundtruth_class_labels.clear()
self._groundtruth_is_crowd.clear()
self._predicted_masks.clear()
self._predicted_class_labels.clear()
def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
"""Adds groundtruth for a single image to be used for evaluation.
If the image has already been added, a warning is logged, and groundtruth is
ignored.
Args:
image_id: A unique string/integer identifier for the image.
groundtruth_dict: A dictionary containing -
InputDataFields.groundtruth_classes: integer numpy array of shape
[num_masks] containing 1-indexed groundtruth classes for the mask.
InputDataFields.groundtruth_instance_masks: uint8 numpy array of shape
[num_masks, image_height, image_width] containing groundtruth masks.
The elements of the array must be in {0, 1}.
InputDataFields.groundtruth_is_crowd (optional): integer numpy array of
shape [num_boxes] containing iscrowd flag for groundtruth boxes.
"""
if image_id in self._groundtruth_masks:
tf.logging.warning(
'Ignoring groundtruth with image %s, since it has already been '
'added to the ground truth database.', image_id)
return
self._groundtruth_masks[image_id] = groundtruth_dict[
standard_fields.InputDataFields.groundtruth_instance_masks]
self._groundtruth_class_labels[image_id] = groundtruth_dict[
standard_fields.InputDataFields.groundtruth_classes]
groundtruth_is_crowd = groundtruth_dict.get(
standard_fields.InputDataFields.groundtruth_is_crowd)
# Drop groundtruth_is_crowd if empty tensor.
if groundtruth_is_crowd is not None and not groundtruth_is_crowd.size > 0:
groundtruth_is_crowd = None
if groundtruth_is_crowd is not None:
self._groundtruth_is_crowd[image_id] = groundtruth_is_crowd
def add_single_detected_image_info(self, image_id, detections_dict):
"""Adds detections for a single image to be used for evaluation.
If a detection has already been added for this image id, a warning is
logged, and the detection is skipped.
Args:
image_id: A unique string/integer identifier for the image.
detections_dict: A dictionary containing -
DetectionResultFields.detection_classes: integer numpy array of shape
[num_masks] containing 1-indexed detection classes for the masks.
DetectionResultFields.detection_masks: optional uint8 numpy array of
shape [num_masks, image_height, image_width] containing instance
masks. The elements of the array must be in {0, 1}.
Raises:
      ValueError: If the detection and groundtruth mask shapes don't match.
"""
if image_id not in self._groundtruth_masks:
raise ValueError('Missing groundtruth for image id: {}'.format(image_id))
detection_masks = detections_dict[
standard_fields.DetectionResultFields.detection_masks]
self._predicted_masks[image_id] = detection_masks
self._predicted_class_labels[image_id] = detections_dict[
standard_fields.DetectionResultFields.detection_classes]
groundtruth_mask_shape = self._groundtruth_masks[image_id].shape
if groundtruth_mask_shape[1:] != detection_masks.shape[1:]:
raise ValueError("The shape of results doesn't match groundtruth.")
def evaluate(self):
"""Evaluates the detection masks and returns a dictionary of coco metrics.
Returns:
A dictionary holding -
1. summary_metric:
'PanopticQuality@%.2fIOU': mean panoptic quality averaged over classes at
the required IOU.
'SegmentationQuality@%.2fIOU': mean segmentation quality averaged over
classes at the required IOU.
'RecognitionQuality@%.2fIOU': mean recognition quality averaged over
classes at the required IOU.
'NumValidClasses': number of valid classes. A valid class should have at
least one normal (is_crowd=0) groundtruth mask or one predicted mask.
'NumTotalClasses': number of total classes.
2. per_category_pq: if include_metrics_per_category is True, category
specific results with keys of the form:
'PanopticQuality@%.2fIOU_ByCategory/category'.
"""
# Evaluate and accumulate the iou/tp/fp/fn.
sum_tp_iou, sum_num_tp, sum_num_fp, sum_num_fn = self._evaluate_all_masks()
# Compute PQ metric for each category and average over all classes.
mask_metrics = self._compute_panoptic_metrics(sum_tp_iou, sum_num_tp,
sum_num_fp, sum_num_fn)
return mask_metrics
def get_estimator_eval_metric_ops(self, eval_dict):
"""Returns a dictionary of eval metric ops.
Note that once value_op is called, the detections and groundtruth added via
update_op are cleared.
Args:
eval_dict: A dictionary that holds tensors for evaluating object detection
performance. For single-image evaluation, this dictionary may be
        produced from eval_util.result_dict_for_single_example(). For
        multi-image evaluation, `eval_dict` should contain the fields
'num_gt_masks_per_image' and 'num_det_masks_per_image' to properly unpad
the tensors from the batch.
Returns:
a dictionary of metric names to tuple of value_op and update_op that can
be used as eval metric ops in tf.estimator.EstimatorSpec. Note that all
update ops must be run together and similarly all value ops must be run
together to guarantee correct behaviour.
"""
def update_op(image_id_batched, groundtruth_classes_batched,
groundtruth_instance_masks_batched,
groundtruth_is_crowd_batched, num_gt_masks_per_image,
detection_classes_batched, detection_masks_batched,
num_det_masks_per_image):
"""Update op for metrics."""
for (image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_mask, detection_classes,
detection_masks, num_det_mask) in zip(
image_id_batched, groundtruth_classes_batched,
groundtruth_instance_masks_batched, groundtruth_is_crowd_batched,
num_gt_masks_per_image, detection_classes_batched,
detection_masks_batched, num_det_masks_per_image):
self.add_single_ground_truth_image_info(
image_id, {
'groundtruth_classes':
groundtruth_classes[:num_gt_mask],
'groundtruth_instance_masks':
groundtruth_instance_masks[:num_gt_mask],
'groundtruth_is_crowd':
groundtruth_is_crowd[:num_gt_mask]
})
self.add_single_detected_image_info(
image_id, {
'detection_classes': detection_classes[:num_det_mask],
'detection_masks': detection_masks[:num_det_mask]
})
# Unpack items from the evaluation dictionary.
(image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_masks_per_image, detection_classes,
detection_masks, num_det_masks_per_image
) = self._unpack_evaluation_dictionary_items(eval_dict)
update_op = tf.py_func(update_op, [
image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_masks_per_image, detection_classes,
detection_masks, num_det_masks_per_image
], [])
metric_names = [
'PanopticQuality@%.2fIOU' % self._iou_threshold,
'SegmentationQuality@%.2fIOU' % self._iou_threshold,
'RecognitionQuality@%.2fIOU' % self._iou_threshold
]
if self._include_metrics_per_category:
for category_dict in self._categories:
metric_names.append('PanopticQuality@%.2fIOU_ByCategory/%s' %
(self._iou_threshold, category_dict['name']))
def first_value_func():
self._metrics = self.evaluate()
self.clear()
return np.float32(self._metrics[metric_names[0]])
def value_func_factory(metric_name):
def value_func():
return np.float32(self._metrics[metric_name])
return value_func
# Ensure that the metrics are only evaluated once.
first_value_op = tf.py_func(first_value_func, [], tf.float32)
eval_metric_ops = {metric_names[0]: (first_value_op, update_op)}
with tf.control_dependencies([first_value_op]):
for metric_name in metric_names[1:]:
eval_metric_ops[metric_name] = (tf.py_func(
value_func_factory(metric_name), [], np.float32), update_op)
return eval_metric_ops
def _evaluate_all_masks(self):
"""Evaluate all masks and compute sum iou/TP/FP/FN."""
sum_num_tp = {category['id']: 0 for category in self._categories}
sum_num_fp = sum_num_tp.copy()
sum_num_fn = sum_num_tp.copy()
sum_tp_iou = sum_num_tp.copy()
for image_id in self._groundtruth_class_labels:
# Separate normal and is_crowd groundtruth
crowd_gt_indices = self._groundtruth_is_crowd.get(image_id)
(normal_gt_masks, normal_gt_classes, crowd_gt_masks,
crowd_gt_classes) = self._separate_normal_and_crowd_labels(
crowd_gt_indices, self._groundtruth_masks[image_id],
self._groundtruth_class_labels[image_id])
# Mask matching to normal GT.
predicted_masks = self._predicted_masks[image_id]
predicted_class_labels = self._predicted_class_labels[image_id]
(overlaps, pred_matched,
gt_matched) = self._match_predictions_to_groundtruths(
predicted_masks,
predicted_class_labels,
normal_gt_masks,
normal_gt_classes,
self._iou_threshold,
is_crowd=False,
with_replacement=False)
# Accumulate true positives.
for (class_id, is_matched, overlap) in zip(predicted_class_labels,
pred_matched, overlaps):
if is_matched:
sum_num_tp[class_id] += 1
sum_tp_iou[class_id] += overlap
# Accumulate false negatives.
for (class_id, is_matched) in zip(normal_gt_classes, gt_matched):
if not is_matched:
sum_num_fn[class_id] += 1
# Match remaining predictions to crowd gt.
remained_pred_indices = np.logical_not(pred_matched)
remained_pred_masks = predicted_masks[remained_pred_indices, :, :]
remained_pred_classes = predicted_class_labels[remained_pred_indices]
_, pred_matched, _ = self._match_predictions_to_groundtruths(
remained_pred_masks,
remained_pred_classes,
crowd_gt_masks,
crowd_gt_classes,
self._ioa_threshold,
is_crowd=True,
with_replacement=True)
# Accumulate false positives
for (class_id, is_matched) in zip(remained_pred_classes, pred_matched):
if not is_matched:
sum_num_fp[class_id] += 1
return sum_tp_iou, sum_num_tp, sum_num_fp, sum_num_fn
def _compute_panoptic_metrics(self, sum_tp_iou, sum_num_tp, sum_num_fp,
sum_num_fn):
"""Compute PQ metric for each category and average over all classes.
Args:
sum_tp_iou: dict, summed true positive intersection-over-union (IoU) for
each class, keyed by class_id.
sum_num_tp: the total number of true positives for each class, keyed by
class_id.
sum_num_fp: the total number of false positives for each class, keyed by
class_id.
sum_num_fn: the total number of false negatives for each class, keyed by
class_id.
Returns:
mask_metrics: a dictionary containing averaged metrics over all classes,
and per-category metrics if required.
"""
mask_metrics = {}
sum_pq = 0
sum_sq = 0
sum_rq = 0
num_valid_classes = 0
for category in self._categories:
class_id = category['id']
(panoptic_quality, segmentation_quality,
recognition_quality) = self._compute_panoptic_metrics_single_class(
sum_tp_iou[class_id], sum_num_tp[class_id], sum_num_fp[class_id],
sum_num_fn[class_id])
if panoptic_quality is not None:
sum_pq += panoptic_quality
sum_sq += segmentation_quality
sum_rq += recognition_quality
num_valid_classes += 1
if self._include_metrics_per_category:
mask_metrics['PanopticQuality@%.2fIOU_ByCategory/%s' %
(self._iou_threshold,
category['name'])] = panoptic_quality
mask_metrics['PanopticQuality@%.2fIOU' %
self._iou_threshold] = sum_pq / num_valid_classes
mask_metrics['SegmentationQuality@%.2fIOU' %
self._iou_threshold] = sum_sq / num_valid_classes
mask_metrics['RecognitionQuality@%.2fIOU' %
self._iou_threshold] = sum_rq / num_valid_classes
mask_metrics['NumValidClasses'] = num_valid_classes
mask_metrics['NumTotalClasses'] = len(self._categories)
return mask_metrics
def _compute_panoptic_metrics_single_class(self, sum_tp_iou, num_tp, num_fp,
num_fn):
"""Compute panoptic metrics: panoptic/segmentation/recognition quality.
More computation details in https://arxiv.org/pdf/1801.00868.pdf.
Args:
sum_tp_iou: summed true positive intersection-over-union (IoU) for a
specific class.
num_tp: the total number of true positives for a specific class.
num_fp: the total number of false positives for a specific class.
num_fn: the total number of false negatives for a specific class.
Returns:
panoptic_quality: sum_tp_iou / (num_tp + 0.5*num_fp + 0.5*num_fn).
segmentation_quality: sum_tp_iou / num_tp.
recognition_quality: num_tp / (num_tp + 0.5*num_fp + 0.5*num_fn).
"""
denominator = num_tp + 0.5 * num_fp + 0.5 * num_fn
# Calculate metric only if there is at least one GT or one prediction.
if denominator > 0:
recognition_quality = num_tp / denominator
if num_tp > 0:
segmentation_quality = sum_tp_iou / num_tp
else:
# If there is no TP for this category.
segmentation_quality = 0
panoptic_quality = segmentation_quality * recognition_quality
return panoptic_quality, segmentation_quality, recognition_quality
else:
return None, None, None
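  # Illustrative worked example (not part of the original evaluator) of the
  # formulas above for one class, with hypothetical counts num_tp=2, num_fp=1,
  # num_fn=1 and sum_tp_iou=1.5:
  #   denominator          = 2 + 0.5 * 1 + 0.5 * 1 = 3.0
  #   recognition_quality  = 2 / 3.0   ~ 0.667
  #   segmentation_quality = 1.5 / 2   = 0.75
  #   panoptic_quality     = 0.75 * 0.667 ~ 0.5
  # i.e. PQ = SQ * RQ = sum_tp_iou / (num_tp + 0.5 * num_fp + 0.5 * num_fn).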
def _separate_normal_and_crowd_labels(self, crowd_gt_indices,
groundtruth_masks, groundtruth_classes):
"""Separate normal and crowd groundtruth class_labels and masks.
Args:
crowd_gt_indices: None or array of shape [num_groundtruths]. If None, all
groundtruths are treated as normal ones.
groundtruth_masks: array of shape [num_groundtruths, height, width].
groundtruth_classes: array of shape [num_groundtruths].
Returns:
normal_gt_masks: array of shape [num_normal_groundtruths, height, width].
normal_gt_classes: array of shape [num_normal_groundtruths].
crowd_gt_masks: array of shape [num_crowd_groundtruths, height, width].
crowd_gt_classes: array of shape [num_crowd_groundtruths].
Raises:
      ValueError: If the number of groundtruth classes doesn't match the
        number of groundtruth masks, or if the shape of crowd_gt_indices
        doesn't match the number of groundtruth masks.
"""
if groundtruth_masks.shape[0] != groundtruth_classes.shape[0]:
raise ValueError(
"The number of masks doesn't match the number of labels.")
if crowd_gt_indices is None:
# All gts are treated as normal
      crowd_gt_indices = np.zeros(groundtruth_classes.shape, dtype=np.bool)
else:
if groundtruth_masks.shape[0] != crowd_gt_indices.shape[0]:
raise ValueError(
"The number of masks doesn't match the number of is_crowd labels.")
crowd_gt_indices = crowd_gt_indices.astype(np.bool)
normal_gt_indices = np.logical_not(crowd_gt_indices)
if normal_gt_indices.size:
normal_gt_masks = groundtruth_masks[normal_gt_indices, :, :]
normal_gt_classes = groundtruth_classes[normal_gt_indices]
crowd_gt_masks = groundtruth_masks[crowd_gt_indices, :, :]
crowd_gt_classes = groundtruth_classes[crowd_gt_indices]
else:
# No groundtruths available, groundtruth_masks.shape = (0, h, w)
normal_gt_masks = groundtruth_masks
normal_gt_classes = groundtruth_classes
crowd_gt_masks = groundtruth_masks
crowd_gt_classes = groundtruth_classes
return normal_gt_masks, normal_gt_classes, crowd_gt_masks, crowd_gt_classes
def _match_predictions_to_groundtruths(self,
predicted_masks,
predicted_classes,
groundtruth_masks,
groundtruth_classes,
matching_threshold,
is_crowd=False,
with_replacement=False):
"""Match the predicted masks to groundtruths.
Args:
predicted_masks: array of shape [num_predictions, height, width].
predicted_classes: array of shape [num_predictions].
groundtruth_masks: array of shape [num_groundtruths, height, width].
groundtruth_classes: array of shape [num_groundtruths].
      matching_threshold: if the overlap between a prediction and a groundtruth
        is larger than this threshold, the prediction is a true positive.
is_crowd: whether the groundtruths are crowd annotation or not. If True,
use intersection over area (IoA) as the overlapping metric; otherwise
use intersection over union (IoU).
      with_replacement: whether a groundtruth can be matched to multiple
        predictions. Only 1-1 matching is allowed for normal groundtruths;
        for crowd groundtruths, 1-to-many matching must be allowed.
Returns:
      best_overlaps: array of shape [num_predictions]. Values represent the
        IoU or IoA with the best matched groundtruth.
pred_matched: array of shape [num_predictions]. Boolean value representing
whether the ith prediction is matched to a groundtruth.
gt_matched: array of shape [num_groundtruth]. Boolean value representing
whether the ith groundtruth is matched to a prediction.
Raises:
ValueError: if the shape of groundtruth/predicted masks doesn't match
groundtruth/predicted classes.
"""
if groundtruth_masks.shape[0] != groundtruth_classes.shape[0]:
raise ValueError(
"The number of GT masks doesn't match the number of labels.")
if predicted_masks.shape[0] != predicted_classes.shape[0]:
raise ValueError(
"The number of predicted masks doesn't match the number of labels.")
gt_matched = np.zeros(groundtruth_classes.shape, dtype=np.bool)
pred_matched = np.zeros(predicted_classes.shape, dtype=np.bool)
best_overlaps = np.zeros(predicted_classes.shape)
for pid in range(predicted_classes.shape[0]):
best_overlap = 0
matched_gt_id = -1
for gid in range(groundtruth_classes.shape[0]):
if predicted_classes[pid] == groundtruth_classes[gid]:
if (not with_replacement) and gt_matched[gid]:
continue
if not is_crowd:
overlap = np_mask_ops.iou(predicted_masks[pid:pid + 1],
groundtruth_masks[gid:gid + 1])[0, 0]
else:
overlap = np_mask_ops.ioa(groundtruth_masks[gid:gid + 1],
predicted_masks[pid:pid + 1])[0, 0]
if overlap >= matching_threshold and overlap > best_overlap:
matched_gt_id = gid
best_overlap = overlap
if matched_gt_id >= 0:
gt_matched[matched_gt_id] = True
pred_matched[pid] = True
best_overlaps[pid] = best_overlap
return best_overlaps, pred_matched, gt_matched
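  # Illustrative worked example (not part of the original evaluator) of the
  # overlap measures used above. Suppose a predicted mask covers 80 pixels, a
  # groundtruth mask covers 100 pixels, and they share 60 pixels:
  #   IoU (normal groundtruth) = 60 / (80 + 100 - 60) = 0.5
  #   IoA (crowd groundtruth)  = 60 / 80 = 0.75, i.e. intersection over the
  #     prediction's area, so a small prediction lying inside a large crowd
  #     region can still be matched even when its IoU with that region is low.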
def _unpack_evaluation_dictionary_items(self, eval_dict):
"""Unpack items from the evaluation dictionary."""
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
image_id = eval_dict[input_data_fields.key]
groundtruth_classes = eval_dict[input_data_fields.groundtruth_classes]
groundtruth_instance_masks = eval_dict[
input_data_fields.groundtruth_instance_masks]
groundtruth_is_crowd = eval_dict.get(input_data_fields.groundtruth_is_crowd,
None)
num_gt_masks_per_image = eval_dict.get(
input_data_fields.num_groundtruth_boxes, None)
detection_classes = eval_dict[detection_fields.detection_classes]
detection_masks = eval_dict[detection_fields.detection_masks]
num_det_masks_per_image = eval_dict.get(detection_fields.num_detections,
None)
if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
if not image_id.shape.as_list():
# Apply a batch dimension to all tensors.
image_id = tf.expand_dims(image_id, 0)
groundtruth_classes = tf.expand_dims(groundtruth_classes, 0)
groundtruth_instance_masks = tf.expand_dims(groundtruth_instance_masks, 0)
groundtruth_is_crowd = tf.expand_dims(groundtruth_is_crowd, 0)
detection_classes = tf.expand_dims(detection_classes, 0)
detection_masks = tf.expand_dims(detection_masks, 0)
if num_gt_masks_per_image is None:
num_gt_masks_per_image = tf.shape(groundtruth_classes)[1:2]
else:
num_gt_masks_per_image = tf.expand_dims(num_gt_masks_per_image, 0)
if num_det_masks_per_image is None:
num_det_masks_per_image = tf.shape(detection_classes)[1:2]
else:
num_det_masks_per_image = tf.expand_dims(num_det_masks_per_image, 0)
else:
if num_gt_masks_per_image is None:
num_gt_masks_per_image = tf.tile(
tf.shape(groundtruth_classes)[1:2],
multiples=tf.shape(groundtruth_classes)[0:1])
if num_det_masks_per_image is None:
num_det_masks_per_image = tf.tile(
tf.shape(detection_classes)[1:2],
multiples=tf.shape(detection_classes)[0:1])
return (image_id, groundtruth_classes, groundtruth_instance_masks,
groundtruth_is_crowd, num_gt_masks_per_image, detection_classes,
detection_masks, num_det_masks_per_image)
......@@ -18,10 +18,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.core import standard_fields
from object_detection.metrics import coco_evaluation
from object_detection.utils import tf_version
def _get_categories_list():
......@@ -250,6 +252,7 @@ class CocoDetectionEvaluationTest(tf.test.TestCase):
})
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoEvaluationPyFuncTest(tf.test.TestCase):
def testGetOneMAPWithMatchingGroundtruthAndDetections(self):
......@@ -926,6 +929,7 @@ class CocoKeypointEvaluationTest(tf.test.TestCase):
-1.0)
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoKeypointEvaluationPyFuncTest(tf.test.TestCase):
def testGetOneMAPWithMatchingKeypoints(self):
......@@ -1438,6 +1442,7 @@ class CocoMaskEvaluationTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_masks_list)
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
def testAddEvalDict(self):
......@@ -1716,5 +1721,221 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_masks_list)
def _get_panoptic_test_data():
  # image1 contains 3 people in gt (2 normal annotations and 1 "is_crowd"
  # annotation), and 3 people in the prediction.
gt_masks1 = np.zeros((3, 50, 50), dtype=np.uint8)
result_masks1 = np.zeros((3, 50, 50), dtype=np.uint8)
gt_masks1[0, 10:20, 20:30] = 1
result_masks1[0, 10:18, 20:30] = 1
gt_masks1[1, 25:30, 25:35] = 1
result_masks1[1, 18:25, 25:30] = 1
gt_masks1[2, 40:50, 40:50] = 1
result_masks1[2, 47:50, 47:50] = 1
gt_class1 = np.array([1, 1, 1])
gt_is_crowd1 = np.array([0, 0, 1])
result_class1 = np.array([1, 1, 1])
  # image2 contains 1 dog and 1 cat in gt, while the prediction contains 1
  # person and 1 dog.
gt_masks2 = np.zeros((2, 30, 40), dtype=np.uint8)
result_masks2 = np.zeros((2, 30, 40), dtype=np.uint8)
gt_masks2[0, 5:15, 20:35] = 1
gt_masks2[1, 20:30, 0:10] = 1
result_masks2[0, 20:25, 10:15] = 1
result_masks2[1, 6:15, 15:35] = 1
gt_class2 = np.array([2, 3])
gt_is_crowd2 = np.array([0, 0])
result_class2 = np.array([1, 2])
gt_class = [gt_class1, gt_class2]
gt_masks = [gt_masks1, gt_masks2]
gt_is_crowd = [gt_is_crowd1, gt_is_crowd2]
result_class = [result_class1, result_class2]
result_masks = [result_masks1, result_masks2]
return gt_class, gt_masks, gt_is_crowd, result_class, result_masks
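# Illustrative derivation (not part of the original tests) of the expected
# values asserted below, from the masks defined in _get_panoptic_test_data:
# - person (class 1): in image1, prediction 0 overlaps gt 0 with IoU
#   80 / 100 = 0.8 (true positive); prediction 1 misses gt 1 entirely (false
#   positive, and gt 1 becomes a false negative); prediction 2 lies inside the
#   "is_crowd" region (IoA 1.0), so it is ignored. In image2 the person
#   prediction has no person groundtruth, adding one more false positive.
#   Hence PQ = 0.8 / (1 + 0.5 * 2 + 0.5 * 1) = 0.32, SQ = 0.8, RQ = 0.4.
# - dog (class 2): a single match with IoU 135 / 195 and no FP/FN, so
#   PQ = SQ = 135 / 195 and RQ = 1.
# - cat (class 3): one unmatched groundtruth, so PQ = SQ = RQ = 0.
# Averaging over the 3 valid classes gives the expected values below, e.g.
# PanopticQuality@0.50IOU = (0.32 + 135.0 / 195 + 0) / 3.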
class CocoPanopticEvaluationTest(tf.test.TestCase):
def test_panoptic_quality(self):
pq_evaluator = coco_evaluation.CocoPanopticSegmentationEvaluator(
_get_categories_list(), include_metrics_per_category=True)
(gt_class, gt_masks, gt_is_crowd, result_class,
result_masks) = _get_panoptic_test_data()
for i in range(2):
pq_evaluator.add_single_ground_truth_image_info(
image_id='image%d' % i,
groundtruth_dict={
standard_fields.InputDataFields.groundtruth_classes:
gt_class[i],
standard_fields.InputDataFields.groundtruth_instance_masks:
gt_masks[i],
standard_fields.InputDataFields.groundtruth_is_crowd:
gt_is_crowd[i]
})
pq_evaluator.add_single_detected_image_info(
image_id='image%d' % i,
detections_dict={
standard_fields.DetectionResultFields.detection_classes:
result_class[i],
standard_fields.DetectionResultFields.detection_masks:
result_masks[i]
})
metrics = pq_evaluator.evaluate()
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU_ByCategory/person'],
0.32)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU_ByCategory/dog'],
135.0 / 195)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU_ByCategory/cat'], 0)
self.assertAlmostEqual(metrics['SegmentationQuality@0.50IOU'],
(0.8 + 135.0 / 195) / 3)
self.assertAlmostEqual(metrics['RecognitionQuality@0.50IOU'], (0.4 + 1) / 3)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU'],
(0.32 + 135.0 / 195) / 3)
self.assertEqual(metrics['NumValidClasses'], 3)
self.assertEqual(metrics['NumTotalClasses'], 3)
@unittest.skipIf(tf_version.is_tf2(), 'Only Supported in TF1.X')
class CocoPanopticEvaluationPyFuncTest(tf.test.TestCase):
def testPanopticQualityNoBatch(self):
pq_evaluator = coco_evaluation.CocoPanopticSegmentationEvaluator(
_get_categories_list(), include_metrics_per_category=True)
image_id = tf.placeholder(tf.string, shape=())
groundtruth_classes = tf.placeholder(tf.int32, shape=(None))
groundtruth_masks = tf.placeholder(tf.uint8, shape=(None, None, None))
groundtruth_is_crowd = tf.placeholder(tf.int32, shape=(None))
detection_classes = tf.placeholder(tf.int32, shape=(None))
detection_masks = tf.placeholder(tf.uint8, shape=(None, None, None))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_classes: groundtruth_classes,
input_data_fields.groundtruth_instance_masks: groundtruth_masks,
input_data_fields.groundtruth_is_crowd: groundtruth_is_crowd,
detection_fields.detection_classes: detection_classes,
detection_fields.detection_masks: detection_masks,
}
eval_metric_ops = pq_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['PanopticQuality@0.50IOU']
(gt_class, gt_masks, gt_is_crowd, result_class,
result_masks) = _get_panoptic_test_data()
with self.test_session() as sess:
for i in range(2):
sess.run(
update_op,
feed_dict={
image_id: 'image%d' % i,
groundtruth_classes: gt_class[i],
groundtruth_masks: gt_masks[i],
groundtruth_is_crowd: gt_is_crowd[i],
detection_classes: result_class[i],
detection_masks: result_masks[i]
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.items():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU'],
(0.32 + 135.0 / 195) / 3)
def testPanopticQualityBatched(self):
pq_evaluator = coco_evaluation.CocoPanopticSegmentationEvaluator(
_get_categories_list(), include_metrics_per_category=True)
batch_size = 2
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_classes = tf.placeholder(tf.int32, shape=(batch_size, None))
groundtruth_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
groundtruth_is_crowd = tf.placeholder(tf.int32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.int32, shape=(batch_size, None))
detection_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
num_gt_masks_per_image = tf.placeholder(tf.int32, shape=(batch_size))
num_det_masks_per_image = tf.placeholder(tf.int32, shape=(batch_size))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_classes: groundtruth_classes,
input_data_fields.groundtruth_instance_masks: groundtruth_masks,
input_data_fields.groundtruth_is_crowd: groundtruth_is_crowd,
input_data_fields.num_groundtruth_boxes: num_gt_masks_per_image,
detection_fields.detection_classes: detection_classes,
detection_fields.detection_masks: detection_masks,
detection_fields.num_detections: num_det_masks_per_image,
}
eval_metric_ops = pq_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['PanopticQuality@0.50IOU']
(gt_class, gt_masks, gt_is_crowd, result_class,
result_masks) = _get_panoptic_test_data()
with self.test_session() as sess:
sess.run(
update_op,
feed_dict={
image_id: ['image0', 'image1'],
groundtruth_classes:
np.stack([
gt_class[0],
np.pad(gt_class[1], (0, 1), mode='constant')
],
axis=0),
groundtruth_masks:
np.stack([
np.pad(
gt_masks[0], ((0, 0), (0, 10), (0, 10)),
mode='constant'),
np.pad(
gt_masks[1], ((0, 1), (0, 30), (0, 20)),
mode='constant'),
],
axis=0),
groundtruth_is_crowd:
np.stack([
gt_is_crowd[0],
np.pad(gt_is_crowd[1], (0, 1), mode='constant')
],
axis=0),
num_gt_masks_per_image: np.array([3, 2]),
detection_classes:
np.stack([
result_class[0],
np.pad(result_class[1], (0, 1), mode='constant')
],
axis=0),
detection_masks:
np.stack([
np.pad(
result_masks[0], ((0, 0), (0, 10), (0, 10)),
mode='constant'),
np.pad(
result_masks[1], ((0, 1), (0, 30), (0, 20)),
mode='constant'),
],
axis=0),
num_det_masks_per_image: np.array([3, 2]),
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.items():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['PanopticQuality@0.50IOU'],
(0.32 + 135.0 / 195) / 3)
if __name__ == '__main__':
tf.test.main()
......@@ -52,6 +52,7 @@ from pycocotools import coco
from pycocotools import cocoeval
from pycocotools import mask
import six
from six.moves import range
from six.moves import zip
import tensorflow.compat.v1 as tf
......@@ -353,7 +354,9 @@ def _RleCompress(masks):
Returns:
A pycocotools Run-length encoding of the mask.
"""
return mask.encode(np.asfortranarray(masks))
rle = mask.encode(np.asfortranarray(masks))
rle['counts'] = six.ensure_str(rle['counts'])
return rle
def ExportSingleImageGroundtruthToCoco(image_id,
......
......@@ -36,8 +36,8 @@ import os
import re
import tensorflow.compat.v1 as tf
from object_detection import eval_util
from object_detection.core import standard_fields
from object_detection.legacy import evaluator
from object_detection.metrics import tf_example_parser
from object_detection.utils import config_util
from object_detection.utils import label_map_util
......@@ -94,7 +94,7 @@ def read_data_and_evaluate(input_config, eval_config):
categories = label_map_util.create_categories_from_labelmap(
input_config.label_map_path)
object_detection_evaluators = evaluator.get_evaluators(
object_detection_evaluators = eval_util.get_evaluators(
eval_config, categories)
# Support a single evaluator
object_detection_evaluator = object_detection_evaluators[0]
......
......@@ -20,19 +20,17 @@ from __future__ import print_function
import functools
import os
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.contrib.tpu.python.tpu import tpu_config
from tensorflow.contrib.tpu.python.tpu import tpu_estimator
from object_detection import inputs
from object_detection import model_hparams
from object_detection import model_lib
from object_detection.builders import model_builder
from object_detection.core import standard_fields as fields
from object_detection.utils import config_util
from object_detection.utils import tf_version
# Model for test. Options are:
......@@ -122,6 +120,7 @@ def _make_initializable_iterator(dataset):
return iterator
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ModelLibTest(tf.test.TestCase):
@classmethod
......@@ -337,8 +336,7 @@ class ModelLibTest(tf.test.TestCase):
def test_create_tpu_estimator_and_inputs(self):
"""Tests that number of train/eval defaults to config values."""
run_config = tpu_config.RunConfig()
run_config = tf.estimator.tpu.RunConfig()
hparams = model_hparams.create_hparams(
hparams_overrides='load_pretrained=false')
pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
......@@ -352,7 +350,7 @@ class ModelLibTest(tf.test.TestCase):
estimator = train_and_eval_dict['estimator']
train_steps = train_and_eval_dict['train_steps']
self.assertIsInstance(estimator, tpu_estimator.TPUEstimator)
self.assertIsInstance(estimator, tf.estimator.tpu.TPUEstimator)
self.assertEqual(20, train_steps)
def test_create_train_and_eval_specs(self):
......@@ -406,6 +404,7 @@ class ModelLibTest(tf.test.TestCase):
self.assertEqual(None, experiment.eval_steps)
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class UnbatchTensorsTest(tf.test.TestCase):
def test_unbatch_without_unpadding(self):
......
......@@ -20,7 +20,7 @@ from __future__ import print_function
import os
import tempfile
import unittest
import numpy as np
import six
import tensorflow.compat.v1 as tf
......@@ -32,6 +32,7 @@ from object_detection.builders import model_builder
from object_detection.core import model
from object_detection.protos import train_pb2
from object_detection.utils import config_util
from object_detection.utils import tf_version
if six.PY2:
import mock # pylint: disable=g-importing-member,g-import-not-at-top
......@@ -72,6 +73,7 @@ def _get_config_kwarg_overrides():
}
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class ModelLibTest(tf.test.TestCase):
@classmethod
......@@ -139,6 +141,7 @@ class SimpleModel(model.DetectionModel):
return []
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class ModelCheckpointTest(tf.test.TestCase):
"""Test for model checkpoint related functionality."""
......@@ -171,6 +174,7 @@ class IncompatibleModel(SimpleModel):
return {'weight': self.weight}
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CheckpointV2Test(tf.test.TestCase):
def setUp(self):
......
......@@ -358,7 +358,7 @@ def load_fine_tune_checkpoint(
ckpt.restore(checkpoint_path).assert_existing_objects_matched()
def _get_filepath(strategy, filepath):
def get_filepath(strategy, filepath):
"""Get appropriate filepath for worker.
Args:
......@@ -377,7 +377,7 @@ def _get_filepath(strategy, filepath):
return os.path.join(filepath, 'temp_worker_{:03d}'.format(task_id))
def _clean_temporary_directories(strategy, filepath):
def clean_temporary_directories(strategy, filepath):
"""Temporary directory clean up for MultiWorker Mirrored Strategy.
This is needed for all non-chief workers.
......@@ -539,8 +539,8 @@ def train_loop(
## Train the model
# Get the appropriate filepath (temporary or not) based on whether the worker
# is the chief.
summary_writer_filepath = _get_filepath(strategy,
os.path.join(model_dir, 'train'))
summary_writer_filepath = get_filepath(strategy,
os.path.join(model_dir, 'train'))
summary_writer = tf.compat.v2.summary.create_file_writer(
summary_writer_filepath)
......@@ -567,7 +567,7 @@ def train_loop(
ckpt = tf.compat.v2.train.Checkpoint(
step=global_step, model=detection_model, optimizer=optimizer)
manager_dir = _get_filepath(strategy, model_dir)
manager_dir = get_filepath(strategy, model_dir)
if not strategy.extended.should_checkpoint:
checkpoint_max_to_keep = 1
manager = tf.compat.v2.train.CheckpointManager(
......@@ -615,6 +615,10 @@ def train_loop(
return _sample_and_train(strategy, train_step_fn, data_iterator)
train_input_iter = iter(train_input)
if int(global_step.value()) == 0:
manager.save()
checkpointed_step = int(global_step.value())
logged_step = global_step.value()
......@@ -646,8 +650,8 @@ def train_loop(
# Remove the checkpoint directories of the non-chief workers that
# MultiWorkerMirroredStrategy forces us to save during sync distributed
# training.
_clean_temporary_directories(strategy, manager_dir)
_clean_temporary_directories(strategy, summary_writer_filepath)
clean_temporary_directories(strategy, manager_dir)
clean_temporary_directories(strategy, summary_writer_filepath)
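A minimal sketch (hypothetical helper name and arguments) of the per-worker filepath pattern that the now-public get_filepath/clean_temporary_directories helpers implement: the chief writes checkpoints and summaries directly to model_dir, while non-chief workers under MultiWorkerMirroredStrategy write to disposable temp_worker_* subdirectories that are removed at the end of training.

import os

def get_filepath_sketch(is_chief, task_id, filepath):
  # Chief keeps the real directory; other workers get a disposable one,
  # mirroring the 'temp_worker_{:03d}' path in the hunk above.
  if is_chief:
    return filepath
  return os.path.join(filepath, 'temp_worker_{:03d}'.format(task_id))

print(get_filepath_sketch(True, 0, '/tmp/model_dir'))   # /tmp/model_dir
print(get_filepath_sketch(False, 2, '/tmp/model_dir'))  # /tmp/model_dir/temp_worker_002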
def eager_eval_loop(
......
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Creates and runs TF2 object detection models.
##################################
NOTE: This module has not been fully tested; please bear with us while we iron
out the kinks.
##################################
When a TPU device is available, this binary uses TPUStrategy. Otherwise, it uses
GPUs with MirroredStrategy/MultiWorkerMirroredStrategy.
For local training/evaluation run:
PIPELINE_CONFIG_PATH=path/to/pipeline.config
MODEL_DIR=/tmp/model_outputs
NUM_TRAIN_STEPS=10000
SAMPLE_1_OF_N_EVAL_EXAMPLES=1
python model_main_tf2.py -- \
--model_dir=$MODEL_DIR --num_train_steps=$NUM_TRAIN_STEPS \
--sample_1_of_n_eval_examples=$SAMPLE_1_OF_N_EVAL_EXAMPLES \
--pipeline_config_path=$PIPELINE_CONFIG_PATH \
--alsologtostderr
"""
from absl import flags
import tensorflow.compat.v2 as tf
from object_detection import model_hparams
from object_detection import model_lib_v2
flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config '
'file.')
flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.')
flags.DEFINE_bool('eval_on_train_data', False, 'Enable evaluating on train '
'data (only supported in distributed training).')
flags.DEFINE_integer('sample_1_of_n_eval_examples', None, 'Will sample one of '
'every n eval input examples, where n is provided.')
flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5, 'Will sample '
'one of every n train input examples for evaluation, '
'where n is provided. This is only used if '
'`eval_on_train_data` is True.')
flags.DEFINE_string(
'hparams_overrides', None, 'Hyperparameter overrides, '
'represented as a string containing comma-separated '
'hparam_name=value pairs.')
flags.DEFINE_string(
'model_dir', None, 'Path to output model directory '
'where event and checkpoint files will be written.')
flags.DEFINE_string(
'checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
'`checkpoint_dir` is provided, this binary operates in eval-only mode, '
'writing resulting metrics to `model_dir`.')
flags.DEFINE_integer('eval_timeout', 3600, 'Number of seconds to wait for an '
'evaluation checkpoint before exiting.')
flags.DEFINE_bool('use_tpu', False, 'Whether the job is executing on a TPU.')
flags.DEFINE_integer(
'num_workers', 1, 'When num_workers > 1, training uses '
'MultiWorkerMirroredStrategy. When num_workers = 1 it uses '
'MirroredStrategy.')
FLAGS = flags.FLAGS
def main(unused_argv):
flags.mark_flag_as_required('model_dir')
flags.mark_flag_as_required('pipeline_config_path')
tf.config.set_soft_device_placement(True)
if FLAGS.checkpoint_dir:
model_lib_v2.eval_continuously(
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path,
model_dir=FLAGS.model_dir,
train_steps=FLAGS.num_train_steps,
sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
sample_1_of_n_eval_on_train_examples=(
FLAGS.sample_1_of_n_eval_on_train_examples),
checkpoint_dir=FLAGS.checkpoint_dir,
wait_interval=300, timeout=FLAGS.eval_timeout)
else:
if tf.config.get_visible_devices('TPU'):
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
elif FLAGS.num_workers > 1:
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
else:
strategy = tf.compat.v2.distribute.MirroredStrategy()
with strategy.scope():
model_lib_v2.train_loop(
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
pipeline_config_path=FLAGS.pipeline_config_path,
model_dir=FLAGS.model_dir,
train_steps=FLAGS.num_train_steps,
use_tpu=FLAGS.use_tpu)
if __name__ == '__main__':
tf.compat.v1.app.run()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Hourglass[1] feature extractor for CenterNet[2] meta architecture.
[1]: https://arxiv.org/abs/1603.06937
[2]: https://arxiv.org/abs/1904.07850
"""
from object_detection.meta_architectures import center_net_meta_arch
from object_detection.models.keras_models import hourglass_network
class CenterNetHourglassFeatureExtractor(
center_net_meta_arch.CenterNetFeatureExtractor):
"""The hourglass feature extractor for CenterNet.
This class is a thin wrapper around the hourglass network, along with some
preprocessing methods inherited from the base class.
"""
def __init__(self, hourglass_net, channel_means=(0., 0., 0.),
channel_stds=(1., 1., 1.), bgr_ordering=False):
"""Intializes the feature extractor.
Args:
hourglass_net: The underlying hourglass network to use.
channel_means: A tuple of floats, denoting the mean of each channel
which will be subtracted from it.
channel_stds: A tuple of floats, denoting the standard deviation of each
channel. Each channel will be divided by its standard deviation value.
bgr_ordering: bool, if set will change the channel ordering to be in the
[blue, green, red] order.
"""
super(CenterNetHourglassFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
self._network = hourglass_net
def call(self, inputs):
return self._network(inputs)
@property
def out_stride(self):
"""The stride in the output image of the network."""
return 4
@property
def num_feature_outputs(self):
"""Ther number of feature outputs returned by the feature extractor."""
return self._network.num_hourglasses
def get_model(self):
return self._network
def hourglass_104(channel_means, channel_stds, bgr_ordering):
"""The Hourglass-104 backbone for CenterNet."""
network = hourglass_network.hourglass_104()
return CenterNetHourglassFeatureExtractor(
network, channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
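A minimal usage sketch (not part of this file) of the hourglass_104 builder above; constructing the full Hourglass-104 backbone is heavy, which is why the test below instantiates a much smaller HourglassNetwork instead.

from object_detection.models import center_net_hourglass_feature_extractor as hourglass_fe

extractor = hourglass_fe.hourglass_104(
    channel_means=(0., 0., 0.), channel_stds=(1., 1., 1.), bgr_ordering=False)
assert extractor.out_stride == 4
# num_feature_outputs mirrors the number of hourglass stacks in the backbone.
assert extractor.num_feature_outputs == extractor.get_model().num_hourglasses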
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Testing hourglass feature extractor for CenterNet."""
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.models import center_net_hourglass_feature_extractor as hourglass
from object_detection.models.keras_models import hourglass_network
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetHourglassFeatureExtractorTest(test_case.TestCase):
def test_center_net_hourglass_feature_extractor(self):
net = hourglass_network.HourglassNetwork(
num_stages=4, blocks_per_stage=[2, 3, 4, 5, 6],
channel_dims=[4, 6, 8, 10, 12, 14], num_hourglasses=2)
model = hourglass.CenterNetHourglassFeatureExtractor(net)
def graph_fn():
return model(tf.zeros((2, 64, 64, 3), dtype=np.float32))
outputs = self.execute(graph_fn, [])
self.assertEqual(outputs[0].shape, (2, 16, 16, 6))
self.assertEqual(outputs[1].shape, (2, 16, 16, 6))
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resnetv2 based feature extractors for CenterNet[1] meta architecture.
[1]: https://arxiv.org/abs/1904.07850
"""
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures.center_net_meta_arch import CenterNetFeatureExtractor
class CenterNetResnetFeatureExtractor(CenterNetFeatureExtractor):
"""Resnet v2 base feature extractor for the CenterNet model."""
def __init__(self, resnet_type, channel_means=(0., 0., 0.),
channel_stds=(1., 1., 1.), bgr_ordering=False):
"""Initializes the feature extractor with a specific ResNet architecture.
Args:
resnet_type: A string specifying which kind of ResNet to use. Currently
only `resnet_v2_50` and `resnet_v2_101` are supported.
channel_means: A tuple of floats, denoting the mean of each channel
which will be subtracted from it.
channel_stds: A tuple of floats, denoting the standard deviation of each
channel. Each channel will be divided by its standard deviation value.
bgr_ordering: bool, if set will change the channel ordering to be in the
[blue, green, red] order.
"""
super(CenterNetResnetFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
if resnet_type == 'resnet_v2_101':
self._base_model = tf.keras.applications.ResNet101V2(weights=None)
output_layer = 'conv5_block3_out'
elif resnet_type == 'resnet_v2_50':
self._base_model = tf.keras.applications.ResNet50V2(weights=None)
output_layer = 'conv5_block3_out'
else:
raise ValueError('Unknown Resnet Model {}'.format(resnet_type))
output_layer = self._base_model.get_layer(output_layer)
self._resnet_model = tf.keras.models.Model(inputs=self._base_model.input,
outputs=output_layer.output)
resnet_output = self._resnet_model(self._base_model.input)
for num_filters in [256, 128, 64]:
# TODO(vighneshb) This section has a few differences from the paper
# Figure out how much of a performance impact they have.
# 1. We use a simple convolution instead of a deformable convolution
conv = tf.keras.layers.Conv2D(filters=num_filters, kernel_size=3,
strides=1, padding='same')
resnet_output = conv(resnet_output)
resnet_output = tf.keras.layers.BatchNormalization()(resnet_output)
resnet_output = tf.keras.layers.ReLU()(resnet_output)
# 2. We use the default initialization for the convolution layers
# instead of initializing it to do bilinear upsampling.
conv_transpose = tf.keras.layers.Conv2DTranspose(filters=num_filters,
kernel_size=3, strides=2,
padding='same')
resnet_output = conv_transpose(resnet_output)
resnet_output = tf.keras.layers.BatchNormalization()(resnet_output)
resnet_output = tf.keras.layers.ReLU()(resnet_output)
self._feature_extractor_model = tf.keras.models.Model(
inputs=self._base_model.input, outputs=resnet_output)
def preprocess(self, resized_inputs):
"""Preprocess input images for the ResNet model.
This scales images in the range [0, 255] to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float32 tensor.
Returns:
outputs: a [batch, height, width, channels] float32 tensor.
"""
resized_inputs = super(CenterNetResnetFeatureExtractor, self).preprocess(
resized_inputs)
return tf.keras.applications.resnet_v2.preprocess_input(resized_inputs)
def load_feature_extractor_weights(self, path):
self._base_model.load_weights(path)
def get_base_model(self):
"""Get base resnet model for inspection and testing."""
return self._base_model
def call(self, inputs):
"""Returns image features extracted by the backbone.
Args:
inputs: An image tensor of shape [batch_size, input_height,
input_width, 3]
Returns:
features_list: A list of length 1 containing a tensor of shape
[batch_size, input_height // 4, input_width // 4, 64] containing
the features extracted by the ResNet.
"""
return [self._feature_extractor_model(inputs)]
@property
def num_feature_outputs(self):
return 1
@property
def out_stride(self):
return 4
def resnet_v2_101(channel_means, channel_stds, bgr_ordering):
"""The ResNet v2 101 feature extractor."""
return CenterNetResnetFeatureExtractor(
resnet_type='resnet_v2_101',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering
)
def resnet_v2_50(channel_means, channel_stds, bgr_ordering):
"""The ResNet v2 50 feature extractor."""
return CenterNetResnetFeatureExtractor(
resnet_type='resnet_v2_50',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
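A minimal arithmetic sketch (hypothetical sizes, not part of this file) of why out_stride is 4 and why the tests below expect 56x56 feature maps from 224x224 inputs: the ResNet conv5 output sits at stride 32, and the three stride-2 transpose convolutions above upsample it by a factor of 8.

input_size = 224
backbone_stride = 32                  # stride of conv5_block3_out
upsampling = 2 ** 3                   # three stride-2 Conv2DTranspose layers
out_stride = backbone_stride // upsampling
feature_size = input_size // out_stride
assert (out_stride, feature_size) == (4, 56)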
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Testing ResNet v2 models for the CenterNet meta architecture."""
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
from object_detection.models import center_net_resnet_feature_extractor
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class CenterNetResnetFeatureExtractorTest(test_case.TestCase):
def test_output_size(self):
"""Verify that shape of features returned by the backbone is correct."""
model = center_net_resnet_feature_extractor.\
CenterNetResnetFeatureExtractor('resnet_v2_101')
def graph_fn():
img = np.zeros((8, 224, 224, 3), dtype=np.float32)
processed_img = model.preprocess(img)
return model(processed_img)
outputs = self.execute(graph_fn, [])
self.assertEqual(outputs.shape, (8, 56, 56, 64))
def test_output_size_resnet50(self):
"""Verify that shape of features returned by the backbone is correct."""
model = center_net_resnet_feature_extractor.\
CenterNetResnetFeatureExtractor('resnet_v2_50')
def graph_fn():
img = np.zeros((8, 224, 224, 3), dtype=np.float32)
processed_img = model.preprocess(img)
return model(processed_img)
outputs = self.execute(graph_fn, [])
self.assertEqual(outputs.shape, (8, 56, 56, 64))
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resnetv1 FPN [1] based feature extractors for CenterNet[2] meta architecture.
[1]: https://arxiv.org/abs/1612.03144.
[2]: https://arxiv.org/abs/1904.07850.
"""
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures.center_net_meta_arch import CenterNetFeatureExtractor
_RESNET_MODEL_OUTPUT_LAYERS = {
'resnet_v1_50': ['conv2_block3_out', 'conv3_block4_out',
'conv4_block6_out', 'conv5_block3_out'],
'resnet_v1_101': ['conv2_block3_out', 'conv3_block4_out',
'conv4_block23_out', 'conv5_block3_out'],
}
class CenterNetResnetV1FpnFeatureExtractor(CenterNetFeatureExtractor):
"""Resnet v1 FPN base feature extractor for the CenterNet model.
This feature extractor uses residual skip connections and nearest neighbor
upsampling to produce an output feature map of stride 4, which has precise
localization information along with strong semantic information from the top
of the net. This design does not exactly follow the original FPN design,
specifically:
- Since only one output map is necessary for heatmap prediction (stride 4
output), the top-down feature maps can have different numbers of channels.
Specifically, the top down feature maps have the following sizes:
[h/4, w/4, 64], [h/8, w/8, 128], [h/16, w/16, 256], [h/32, w/32, 256].
- No additional coarse features are used after conv5_x.
"""
def __init__(self, resnet_type, channel_means=(0., 0., 0.),
channel_stds=(1., 1., 1.), bgr_ordering=False):
"""Initializes the feature extractor with a specific ResNet architecture.
Args:
resnet_type: A string specifying which kind of ResNet to use. Currently
only `resnet_v1_50` and `resnet_v1_101` are supported.
channel_means: A tuple of floats, denoting the mean of each channel
which will be subtracted from it.
channel_stds: A tuple of floats, denoting the standard deviation of each
channel. Each channel will be divided by its standard deviation value.
bgr_ordering: bool, if set will change the channel ordering to be in the
[blue, green, red] order.
"""
super(CenterNetResnetV1FpnFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
if resnet_type == 'resnet_v1_50':
self._base_model = tf.keras.applications.ResNet50(weights=None)
elif resnet_type == 'resnet_v1_101':
self._base_model = tf.keras.applications.ResNet101(weights=None)
else:
raise ValueError('Unknown Resnet Model {}'.format(resnet_type))
output_layers = _RESNET_MODEL_OUTPUT_LAYERS[resnet_type]
outputs = [self._base_model.get_layer(output_layer_name).output
for output_layer_name in output_layers]
self._resnet_model = tf.keras.models.Model(inputs=self._base_model.input,
outputs=outputs)
resnet_outputs = self._resnet_model(self._base_model.input)
# Construct the top-down feature maps.
top_layer = resnet_outputs[-1]
residual_op = tf.keras.layers.Conv2D(filters=256, kernel_size=1,
strides=1, padding='same')
top_down = residual_op(top_layer)
num_filters_list = [256, 128, 64]
for i, num_filters in enumerate(num_filters_list):
level_ind = 2 - i
# Upsample.
upsample_op = tf.keras.layers.UpSampling2D(2, interpolation='nearest')
top_down = upsample_op(top_down)
# Residual (skip-connection) from bottom-up pathway.
residual_op = tf.keras.layers.Conv2D(filters=num_filters, kernel_size=1,
strides=1, padding='same')
residual = residual_op(resnet_outputs[level_ind])
# Merge.
top_down = top_down + residual
next_num_filters = num_filters_list[i+1] if i + 1 <= 2 else 64
conv = tf.keras.layers.Conv2D(filters=next_num_filters,
kernel_size=3, strides=1, padding='same')
top_down = conv(top_down)
top_down = tf.keras.layers.BatchNormalization()(top_down)
top_down = tf.keras.layers.ReLU()(top_down)
self._feature_extractor_model = tf.keras.models.Model(
inputs=self._base_model.input, outputs=top_down)
def preprocess(self, resized_inputs):
"""Preprocess input images for the ResNet model.
This scales images in the range [0, 255] to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float32 tensor.
Returns:
outputs: a [batch, height, width, channels] float32 tensor.
"""
resized_inputs = super(
CenterNetResnetV1FpnFeatureExtractor, self).preprocess(resized_inputs)
return tf.keras.applications.resnet.preprocess_input(resized_inputs)
def load_feature_extractor_weights(self, path):
self._base_model.load_weights(path)
def get_base_model(self):
"""Get base resnet model for inspection and testing."""
return self._base_model
def call(self, inputs):
"""Returns image features extracted by the backbone.
Args:
inputs: An image tensor of shape [batch_size, input_height,
input_width, 3]
Returns:
features_list: A list of length 1 containing a tensor of shape
[batch_size, input_height // 4, input_width // 4, 64] containing
the features extracted by the ResNet.
"""
return [self._feature_extractor_model(inputs)]
@property
def num_feature_outputs(self):
return 1
@property
def out_stride(self):
return 4
def resnet_v1_101_fpn(channel_means, channel_stds, bgr_ordering):
"""The ResNet v1 101 FPN feature extractor."""
return CenterNetResnetV1FpnFeatureExtractor(
resnet_type='resnet_v1_101',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering
)
def resnet_v1_50_fpn(channel_means, channel_stds, bgr_ordering):
"""The ResNet v1 50 FPN feature extractor."""
return CenterNetResnetV1FpnFeatureExtractor(
resnet_type='resnet_v1_50',
channel_means=channel_means,
channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
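To make the geometry described in the class docstring concrete, here is a minimal sketch (hypothetical 256x256 input, not part of this file) that walks the top-down loop above and records the feature-map size and channel count at each merge point, ending at the stride-4, 64-channel map returned by call().

h = w = 256
stride = 32                                   # conv5 output of the ResNet
levels = [(h // stride, w // stride, 256)]    # top of the net, after the 1x1 conv
for num_filters in [256, 128, 64]:
  stride //= 2                                # nearest-neighbour 2x upsampling
  # The lateral 1x1 conv brings the skip connection to num_filters channels
  # before the merge; the following 3x3 conv carries it to the next level.
  levels.append((h // stride, w // stride, num_filters))
print(levels)  # [(8, 8, 256), (16, 16, 256), (32, 32, 128), (64, 64, 64)]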