Commit 7a9934df authored by Zhichao Lu, committed by lzc5123016

Merged commit includes the following changes:

184048729  by Zhichao Lu:

    Modify target_assigner so that it creates regression targets taking keypoints into account.

--
184027183  by Zhichao Lu:

    Resnet V1 FPN based feature extractors for SSD meta architecture in Object Detection V2 API.

--
184004730  by Zhichao Lu:

    Expose a lever to override the configured mask_type.

--
183933113  by Zhichao Lu:

    Weight shared convolutional box predictor as described in https://arxiv.org/abs/1708.02002

--
183929669  by Zhichao Lu:

    Expanding box list operations for future data augmentations.

--
183916792  by Zhichao Lu:

    Fix unrecognized assertion function in tests.

--
183906851  by Zhichao Lu:

    - Change ssd meta architecture to use regression weights to compute loss normalizer.

--
183871003  by Zhichao Lu:

    Fix a wrong dependency in config_util_test.

--
183782120  by Zhichao Lu:

    Add __init__ file to third_party directories.

--
183779109  by Zhichao Lu:

    Setup regular version s...
parent 7ef602be
......@@ -18,6 +18,7 @@ import os
import numpy as np
import six
import tensorflow as tf
from google.protobuf import text_format
from object_detection import exporter
from object_detection.builders import model_builder
from object_detection.core import model
......@@ -37,12 +38,13 @@ class FakeModel(model.DetectionModel):
self._add_detection_masks = add_detection_masks
def preprocess(self, inputs):
return tf.identity(inputs)
true_image_shapes = [] # Doesn't matter for the fake model.
return tf.identity(inputs), true_image_shapes
def predict(self, preprocessed_inputs):
def predict(self, preprocessed_inputs, true_image_shapes):
return {'image': tf.layers.conv2d(preprocessed_inputs, 3, 1)}
def postprocess(self, prediction_dict):
def postprocess(self, prediction_dict, true_image_shapes):
with tf.control_dependencies(prediction_dict.values()):
postprocessed_tensors = {
'detection_boxes': tf.constant([[[0.0, 0.0, 0.5, 0.5],
......@@ -63,7 +65,7 @@ class FakeModel(model.DetectionModel):
def restore_map(self, checkpoint_path, from_detection_checkpoint):
pass
def loss(self, prediction_dict):
def loss(self, prediction_dict, true_image_shapes):
pass
......@@ -74,10 +76,10 @@ class ExportInferenceGraphTest(tf.test.TestCase):
g = tf.Graph()
with g.as_default():
mock_model = FakeModel()
preprocessed_inputs = mock_model.preprocess(
preprocessed_inputs, true_image_shapes = mock_model.preprocess(
tf.placeholder(tf.float32, shape=[None, None, None, 3]))
predictions = mock_model.predict(preprocessed_inputs)
mock_model.postprocess(predictions)
predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
mock_model.postprocess(predictions, true_image_shapes)
if use_moving_averages:
tf.train.ExponentialMovingAverage(0.0).apply()
slim.get_or_create_global_step()
......@@ -213,10 +215,10 @@ class ExportInferenceGraphTest(tf.test.TestCase):
graph = tf.Graph()
with graph.as_default():
fake_model = FakeModel()
preprocessed_inputs = fake_model.preprocess(
preprocessed_inputs, true_image_shapes = fake_model.preprocess(
tf.placeholder(dtype=tf.float32, shape=[None, None, None, 3]))
predictions = fake_model.predict(preprocessed_inputs)
fake_model.postprocess(predictions)
predictions = fake_model.predict(preprocessed_inputs, true_image_shapes)
fake_model.postprocess(predictions, true_image_shapes)
exporter.replace_variable_values_with_moving_averages(
graph, trained_checkpoint_prefix, new_checkpoint_prefix)
......@@ -448,7 +450,7 @@ class ExportInferenceGraphTest(tf.test.TestCase):
masks = inference_graph.get_tensor_by_name('detection_masks:0')
num_detections = inference_graph.get_tensor_by_name('num_detections:0')
with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
'^TensorArray has inconsistent shapes.'):
'TensorArray.*shape'):
sess.run([boxes, scores, classes, masks, num_detections],
feed_dict={image_str_tensor: image_str_batch_np})
......@@ -495,6 +497,31 @@ class ExportInferenceGraphTest(tf.test.TestCase):
self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4]))
self.assertAllClose(num_detections_np, [2, 1])
def test_export_graph_saves_pipeline_file(self):
tmp_dir = self.get_temp_dir()
trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt')
self._save_checkpoint_from_mock_model(trained_checkpoint_prefix,
use_moving_averages=True)
output_directory = os.path.join(tmp_dir, 'output')
with mock.patch.object(
model_builder, 'build', autospec=True) as mock_builder:
mock_builder.return_value = FakeModel()
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
exporter.export_inference_graph(
input_type='image_tensor',
pipeline_config=pipeline_config,
trained_checkpoint_prefix=trained_checkpoint_prefix,
output_directory=output_directory)
expected_pipeline_path = os.path.join(
output_directory, 'pipeline.config')
self.assertTrue(os.path.exists(expected_pipeline_path))
written_pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
with tf.gfile.GFile(expected_pipeline_path, 'r') as f:
proto_str = f.read()
text_format.Merge(proto_str, written_pipeline_config)
self.assertProtoEquals(pipeline_config, written_pipeline_config)
def test_export_saved_model_and_run_inference(self):
tmp_dir = self.get_temp_dir()
trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt')
......
......@@ -4,17 +4,16 @@ The Tensorflow Object Detection API currently supports three evaluation protocol
that can be configured in `EvalConfig` by setting `metrics_set` to the
corresponding value.
## PASCAL VOC 2007 metric
## PASCAL VOC 2007 detection metric
`EvalConfig.metrics_set='pascal_voc_metrics'`
`EvalConfig.metrics_set='pascal_voc_detection_metrics'`
The commonly used mAP metric for evaluating the quality of object detectors, computed according to the protocol of the PASCAL VOC Challenge 2007.
The protocol is available [here](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/devkit_doc_07-Jun-2007.pdf).
## Weighted PASCAL VOC detection metric
## Weighted PASCAL VOC metric
`EvalConfig.metrics_set='weighted_pascal_voc_metrics'`
`EvalConfig.metrics_set='weighted_pascal_voc_detection_metrics'`
The weighted PASCAL metric computes the mean average precision as the average
precision when treating all classes as a single class. In comparison,
......@@ -25,7 +24,21 @@ For example, the test set consists of two classes, "cat" and "dog", and there ar
According to the PASCAL VOC 2007 metric, performance on each of the two classes contributes equally towards the final mAP value,
while for the weighted PASCAL VOC metric the final mAP value is influenced by the frequency of each class.
## Open Images metric {#open-images}
## PASCAL VOC 2007 instance segmentation metric
`EvalConfig.metrics_set='pascal_voc_instance_segmentation_metrics'`
Similar to the PASCAL VOC 2007 detection metric, but computes the intersection
over union based on the object masks instead of the object boxes.
## Weighted PASCAL VOC instance segmentation metric
`EvalConfig.metrics_set='weighted_pascal_voc_instance_segmentation_metrics'`
Similar to the weighted PASCAL VOC detection metric, but computes the
intersection over union based on the object masks instead of the object boxes.
## Open Images detection metric {#open-images}
`EvalConfig.metrics_set='open_images_metrics'`
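All of these metrics sets plug into the same place in the pipeline config. As a minimal sketch (every other pipeline field omitted), the relevant block looks like:
```
eval_config: {
  metrics_set: 'open_images_metrics'
}
```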
......
......@@ -8,7 +8,7 @@ graph proto. A checkpoint will typically consist of three files:
* model.ckpt-${CHECKPOINT_NUMBER}.meta
After you've identified a candidate checkpoint to export, run the following
command from tensorflow/models/research/:
command from tensorflow/models/research:
``` bash
# From tensorflow/models/research/
......
......@@ -308,6 +308,18 @@ python object_detection/export_inference_graph.py \
Afterwards, you should see a directory named `exported_graphs` containing the
SavedModel and frozen graph.
## Configuring the Instance Segmentation Pipeline
Mask prediction can be turned on for an object detection config by adding
`predict_instance_masks: true` within the `MaskRCNNBoxPredictor`. Other
parameters, such as the mask size, the number of convolutions in the mask
layer, and the convolution hyperparameters, can also be configured. We will use
`mask_rcnn_resnet101_pets.config` as a starting point for configuring the
instance segmentation pipeline. Everything mentioned above about object
detection also holds for instance segmentation: stripped of training and other
details, an instance segmentation model is an object detection model with an
additional head that predicts the object mask inside each predicted box.
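As a rough sketch of where this lives in a Mask R-CNN pipeline config (the values below are illustrative, not the ones shipped in `mask_rcnn_resnet101_pets.config`):
```
second_stage_box_predictor {
  mask_rcnn_box_predictor {
    predict_instance_masks: true
    mask_prediction_conv_depth: 256
    conv_hyperparams {
      op: CONV
      regularizer { l2_regularizer { weight: 0.0 } }
      initializer { truncated_normal_initializer { stddev: 0.01 } }
    }
  }
}
```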
## What's Next
Congratulations, you have now trained an object detector for various cats and
......
......@@ -103,7 +103,7 @@ FLAGS = flags.FLAGS
def create_tf_example(example):
# TODO(user): Populate the following variables from your example.
# TODO: Populate the following variables from your example.
height = None # Image height
width = None # Image width
filename = None # Filename of the image. Empty if image is not from file
......@@ -139,7 +139,7 @@ def create_tf_example(example):
def main(_):
writer = tf.python_io.TFRecordWriter(FLAGS.output_path)
# TODO(user): Write code to read in your dataset to examples variable
# TODO: Write code to read in your dataset to examples variable
for example in examples:
tf_example = create_tf_example(example)
......
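For illustration, a filled-in `create_tf_example` might look like the sketch below. The `example` dict and its field names are hypothetical stand-ins for whatever structure your dataset provides; the `image/...` feature keys and `dataset_util` helpers match the ones used throughout this tutorial.
```python
import tensorflow as tf

from object_detection.utils import dataset_util


def create_tf_example(example):
  # `example` is assumed to be a dict describing one annotated image.
  with tf.gfile.GFile(example['image_path'], 'rb') as fid:
    encoded_image = fid.read()  # Encoded JPEG bytes read from disk.

  return tf.train.Example(features=tf.train.Features(feature={
      'image/height': dataset_util.int64_feature(example['height']),
      'image/width': dataset_util.int64_feature(example['width']),
      'image/filename': dataset_util.bytes_feature(example['filename']),
      'image/source_id': dataset_util.bytes_feature(example['filename']),
      'image/encoded': dataset_util.bytes_feature(encoded_image),
      'image/format': dataset_util.bytes_feature('jpeg'),
      # Box coordinates are normalized to [0, 1], one entry per box.
      'image/object/bbox/xmin': dataset_util.float_list_feature(
          example['xmins']),
      'image/object/bbox/xmax': dataset_util.float_list_feature(
          example['xmaxs']),
      'image/object/bbox/ymin': dataset_util.float_list_feature(
          example['ymins']),
      'image/object/bbox/ymax': dataset_util.float_list_feature(
          example['ymaxs']),
      'image/object/class/text': dataset_util.bytes_list_feature(
          example['class_names']),
      'image/object/class/label': dataset_util.int64_list_feature(
          example['class_labels']),
  }))
```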
......@@ -13,7 +13,7 @@ py_library(
srcs = ["detection_inference.py"],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/core:standard_fields",
"//tensorflow/models/research/object_detection/core:standard_fields",
],
)
......@@ -22,11 +22,11 @@ py_test(
srcs = ["detection_inference_test.py"],
deps = [
":detection_inference",
"//third_party/py/PIL:pil",
"//third_party/py/numpy",
"//PIL:pil",
"//numpy",
"//tensorflow",
"//tensorflow_models/object_detection/core:standard_fields",
"//tensorflow_models/object_detection/utils:dataset_util",
"//tensorflow/models/research/object_detection/core:standard_fields",
"//tensorflow/models/research/object_detection/utils:dataset_util",
],
)
......
......@@ -17,6 +17,7 @@ r"""Tests for detection_inference.py."""
import os
import StringIO
import numpy as np
from PIL import Image
import tensorflow as tf
......
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model input function for tf-learn object detection model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from object_detection import trainer
from object_detection.builders import dataset_builder
from object_detection.builders import preprocessor_builder
from object_detection.core import prefetcher
from object_detection.core import standard_fields as fields
from object_detection.data_decoders import tf_example_decoder
from object_detection.protos import eval_pb2
from object_detection.protos import input_reader_pb2
from object_detection.protos import train_pb2
from object_detection.utils import dataset_util
from object_detection.utils import ops as util_ops
FEATURES_IMAGE = 'images'
FEATURES_KEY = 'key'
SERVING_FED_EXAMPLE_KEY = 'serialized_example'
def create_train_input_fn(num_classes, train_config, train_input_config):
"""Creates a train `input` function for `Estimator`.
Args:
num_classes: Number of classes, which does not include a background
category.
train_config: A train_pb2.TrainConfig.
train_input_config: An input_reader_pb2.InputReader.
Returns:
`input_fn` for `Estimator` in TRAIN mode.
"""
def _train_input_fn():
"""Returns `features` and `labels` tensor dictionaries for training.
Returns:
features: Dictionary of feature tensors.
features['images'] is a list of N [1, H, W, C] float32 tensors,
where N is the number of images in a batch.
features['key'] is a list of N string tensors, each representing a
unique identifier for the image.
labels: Dictionary of groundtruth tensors.
labels['locations_list'] is a list of N [num_boxes, 4] float32 tensors
containing the corners of the groundtruth boxes.
labels['classes_list'] is a list of N [num_boxes, num_classes] float32
padded one-hot tensors of classes.
labels['masks_list'] is an (optional) list of N [num_boxes, H, W] float32
tensors containing only binary values, which represent instance masks for
objects if they are present in the dataset; otherwise the key is omitted.
labels[fields.InputDataFields.groundtruth_weights] is a list of N
[num_boxes] float32 tensors containing groundtruth weights for the
boxes.
Raises:
TypeError: if the `train_config` or `train_input_config` are not of the
correct type.
"""
if not isinstance(train_config, train_pb2.TrainConfig):
raise TypeError('For training mode, the `train_config` must be a '
'train_pb2.TrainConfig.')
if not isinstance(train_input_config, input_reader_pb2.InputReader):
raise TypeError('The `train_input_config` must be an '
'input_reader_pb2.InputReader.')
def get_next(config):
return dataset_util.make_initializable_iterator(
dataset_builder.build(config)).get_next()
create_tensor_dict_fn = functools.partial(get_next, train_input_config)
data_augmentation_options = [
preprocessor_builder.build(step)
for step in train_config.data_augmentation_options
]
input_queue = trainer.create_input_queue(
batch_size_per_clone=train_config.batch_size,
create_tensor_dict_fn=create_tensor_dict_fn,
batch_queue_capacity=train_config.batch_queue_capacity,
num_batch_queue_threads=train_config.num_batch_queue_threads,
prefetch_queue_capacity=train_config.prefetch_queue_capacity,
data_augmentation_options=data_augmentation_options)
(images_tuple, image_keys, locations_tuple, classes_tuple, masks_tuple,
keypoints_tuple, weights_tuple) = (trainer.get_inputs(
input_queue=input_queue, num_classes=num_classes))
features = {
FEATURES_IMAGE: list(images_tuple),
FEATURES_KEY: list(image_keys)
}
labels = {
'locations_list': list(locations_tuple),
'classes_list': list(classes_tuple)
}
# Make sure that there are no tuple elements with None.
if all(masks is not None for masks in masks_tuple):
labels['masks_list'] = list(masks_tuple)
if all(keypoints is not None for keypoints in keypoints_tuple):
labels['keypoints_list'] = list(keypoints_tuple)
if all((elem is not None for elem in weights_tuple)):
labels[fields.InputDataFields.groundtruth_weights] = list(weights_tuple)
return features, labels
return _train_input_fn
def create_eval_input_fn(num_classes, eval_config, eval_input_config):
"""Creates an eval `input` function for `Estimator`.
Args:
num_classes: Number of classes, which does not include a background
category.
eval_config: An eval_pb2.EvalConfig.
eval_input_config: An input_reader_pb2.InputReader.
Returns:
`input_fn` for `Estimator` in EVAL mode.
"""
def _eval_input_fn():
"""Returns `features` and `labels` tensor dictionaries for evaluation.
Returns:
features: Dictionary of feature tensors.
features['images'] is a [1, H, W, C] float32 tensor.
features['key'] is a string tensor representing a unique identifier for
the image.
labels: Dictionary of groundtruth tensors.
labels['locations_list'] is a list of 1 [num_boxes, 4] float32 tensors
containing the corners of the groundtruth boxes.
labels['classes_list'] is a list of 1 [num_boxes, num_classes] float32
padded one-hot tensors of classes.
labels['masks_list'] is an (optional) list of 1 [num_boxes, H, W]
float32 tensors containing only binary values, which represent
instance masks for objects if they are present in the dataset;
otherwise the key is omitted.
labels['image_id_list'] is a list of 1 string tensors containing the
original image id.
labels['area_list'] is a list of 1 [num_boxes] float32 tensors
containing object mask area in pixels squared.
labels['is_crowd_list'] is a list of 1 [num_boxes] bool tensors
indicating if the boxes enclose a crowd.
labels['difficult_list'] is a list of 1 [num_boxes] bool tensors
indicating if the boxes represent `difficult` instances.
Raises:
TypeError: if the `eval_config` or `eval_input_config` are not of the
correct type.
"""
if not isinstance(eval_config, eval_pb2.EvalConfig):
raise TypeError('For eval mode, the `eval_config` must be an '
'eval_pb2.EvalConfig.')
if not isinstance(eval_input_config, input_reader_pb2.InputReader):
raise TypeError('The `eval_input_config` must be an '
'input_reader_pb2.InputReader.')
input_dict = dataset_util.make_initializable_iterator(
dataset_builder.build(eval_input_config)).get_next()
prefetch_queue = prefetcher.prefetch(input_dict, capacity=500)
input_dict = prefetch_queue.dequeue()
original_image = tf.to_float(
tf.expand_dims(input_dict[fields.InputDataFields.image], 0))
features = {}
features[FEATURES_IMAGE] = original_image
features[FEATURES_KEY] = input_dict[fields.InputDataFields.source_id]
labels = {}
labels['locations_list'] = [
input_dict[fields.InputDataFields.groundtruth_boxes]
]
classes_gt = tf.cast(input_dict[fields.InputDataFields.groundtruth_classes],
tf.int32)
classes_gt -= 1 # Remove the label id offset.
labels['classes_list'] = [
util_ops.padded_one_hot_encoding(
indices=classes_gt, depth=num_classes, left_pad=0)
]
labels['image_id_list'] = [input_dict[fields.InputDataFields.source_id]]
labels['area_list'] = [input_dict[fields.InputDataFields.groundtruth_area]]
labels['is_crowd_list'] = [
input_dict[fields.InputDataFields.groundtruth_is_crowd]
]
labels['difficult_list'] = [
input_dict[fields.InputDataFields.groundtruth_difficult]
]
if fields.InputDataFields.groundtruth_instance_masks in input_dict:
labels['masks_list'] = [
input_dict[fields.InputDataFields.groundtruth_instance_masks]
]
return features, labels
return _eval_input_fn
def create_predict_input_fn():
"""Creates a predict `input` function for `Estimator`.
Returns:
`input_fn` for `Estimator` in PREDICT mode.
"""
def _predict_input_fn():
"""Decodes serialized tf.Examples and returns `ServingInputReceiver`.
Returns:
`ServingInputReceiver`.
"""
example = tf.placeholder(dtype=tf.string, shape=[], name='input_feature')
decoder = tf_example_decoder.TfExampleDecoder(load_instance_masks=False)
input_dict = decoder.decode(example)
images = tf.to_float(input_dict[fields.InputDataFields.image])
images = tf.expand_dims(images, axis=0)
return tf.estimator.export.ServingInputReceiver(
features={FEATURES_IMAGE: images},
receiver_tensors={SERVING_FED_EXAMPLE_KEY: example})
return _predict_input_fn
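A usage sketch mirroring the tests below: the factory builds a callable that, when invoked inside a graph, yields the `(features, labels)` dictionaries documented above. The config path and class count are illustrative (37 matches the Pets label map used in the sample configs).
```python
import tensorflow as tf

from object_detection import inputs
from object_detection.utils import config_util

# Hypothetical path; any of the sample pipeline configs would do.
configs = config_util.get_configs_from_pipeline_file(
    'object_detection/samples/configs/ssd_inception_v2_pets.config')

train_input_fn = inputs.create_train_input_fn(
    num_classes=37,  # Must match the label map in use.
    train_config=configs['train_config'],
    train_input_config=configs['train_input_config'])

# Builds the input pipeline in the current default graph.
features, labels = train_input_fn()
images = features[inputs.FEATURES_IMAGE]  # list of [1, H, W, C] tensors
boxes = labels['locations_list']          # list of [num_boxes, 4] tensors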
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.tflearn.inputs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf
from object_detection import inputs
from object_detection.core import standard_fields as fields
from object_detection.utils import config_util
FLAGS = tf.flags.FLAGS
def _get_configs_for_model(model_name):
"""Returns configurations for model."""
# TODO: Make sure these tests work fine outside google3.
fname = os.path.join(
FLAGS.test_srcdir,
('google3/third_party/tensorflow_models/'
'object_detection/samples/configs/' + model_name + '.config'))
label_map_path = os.path.join(FLAGS.test_srcdir,
('google3/third_party/tensorflow_models/'
'object_detection/data/pet_label_map.pbtxt'))
data_path = os.path.join(FLAGS.test_srcdir,
('google3/third_party/tensorflow_models/'
'object_detection/test_data/pets_examples.record'))
configs = config_util.get_configs_from_pipeline_file(fname)
return config_util.merge_external_params_with_configs(
configs,
train_input_path=data_path,
eval_input_path=data_path,
label_map_path=label_map_path)
class InputsTest(tf.test.TestCase):
def _assert_training_inputs(self, features, labels, num_classes, batch_size):
self.assertEqual(batch_size, len(features['images']))
self.assertEqual(batch_size, len(features['key']))
self.assertEqual(batch_size, len(labels['locations_list']))
self.assertEqual(batch_size, len(labels['classes_list']))
for i in range(batch_size):
image = features['images'][i]
key = features['key'][i]
locations_list = labels['locations_list'][i]
classes_list = labels['classes_list'][i]
weights_list = labels[fields.InputDataFields.groundtruth_weights][i]
self.assertEqual([1, None, None, 3], image.shape.as_list())
self.assertEqual(tf.float32, image.dtype)
self.assertEqual(tf.string, key.dtype)
self.assertEqual([None, 4], locations_list.shape.as_list())
self.assertEqual(tf.float32, locations_list.dtype)
self.assertEqual([None, num_classes], classes_list.shape.as_list())
self.assertEqual(tf.float32, classes_list.dtype)
self.assertEqual([None], weights_list.shape.as_list())
self.assertEqual(tf.float32, weights_list.dtype)
def _assert_eval_inputs(self, features, labels, num_classes):
self.assertEqual(1, len(labels['locations_list']))
self.assertEqual(1, len(labels['classes_list']))
self.assertEqual(1, len(labels['image_id_list']))
self.assertEqual(1, len(labels['area_list']))
self.assertEqual(1, len(labels['is_crowd_list']))
self.assertEqual(1, len(labels['difficult_list']))
image = features['images']
key = features['key']
locations_list = labels['locations_list'][0]
classes_list = labels['classes_list'][0]
image_id_list = labels['image_id_list'][0]
area_list = labels['area_list'][0]
is_crowd_list = labels['is_crowd_list'][0]
difficult_list = labels['difficult_list'][0]
self.assertEqual([1, None, None, 3], image.shape.as_list())
self.assertEqual(tf.float32, image.dtype)
self.assertEqual(tf.string, key.dtype)
self.assertEqual([None, 4], locations_list.shape.as_list())
self.assertEqual(tf.float32, locations_list.dtype)
self.assertEqual([None, num_classes], classes_list.shape.as_list())
self.assertEqual(tf.float32, classes_list.dtype)
self.assertEqual(tf.string, image_id_list.dtype)
self.assertEqual(tf.float32, area_list.dtype)
self.assertEqual(tf.bool, is_crowd_list.dtype)
self.assertEqual(tf.int64, difficult_list.dtype)
def test_faster_rcnn_resnet50_train_input(self):
"""Tests the training input function for FasterRcnnResnet50."""
configs = _get_configs_for_model('faster_rcnn_resnet50_pets')
classes = 37
batch_size = configs['train_config'].batch_size
train_input_fn = inputs.create_train_input_fn(
classes, configs['train_config'], configs['train_input_config'])
features, labels = train_input_fn()
self._assert_training_inputs(features, labels, classes, batch_size)
def test_faster_rcnn_resnet50_eval_input(self):
"""Tests the eval input function for FasterRcnnResnet50."""
configs = _get_configs_for_model('faster_rcnn_resnet50_pets')
classes = 37
eval_input_fn = inputs.create_eval_input_fn(classes, configs['eval_config'],
configs['eval_input_config'])
features, labels = eval_input_fn()
self._assert_eval_inputs(features, labels, classes)
def test_ssd_inceptionV2_train_input(self):
"""Tests the training input function for SSDInceptionV2."""
configs = _get_configs_for_model('ssd_inception_v2_pets')
classes = 37
batch_size = configs['train_config'].batch_size
train_input_fn = inputs.create_train_input_fn(
classes, configs['train_config'], configs['train_input_config'])
features, labels = train_input_fn()
self._assert_training_inputs(features, labels, classes, batch_size)
def test_ssd_inceptionV2_eval_input(self):
"""Tests the eval input function for SSDInceptionV2."""
configs = _get_configs_for_model('ssd_inception_v2_pets')
classes = 37
eval_input_fn = inputs.create_eval_input_fn(classes, configs['eval_config'],
configs['eval_input_config'])
features, labels = eval_input_fn()
self._assert_eval_inputs(features, labels, classes)
def test_predict_input(self):
"""Tests the predict input function."""
predict_input_fn = inputs.create_predict_input_fn()
serving_input_receiver = predict_input_fn()
image = serving_input_receiver.features['images']
receiver_tensors = serving_input_receiver.receiver_tensors[
'serialized_example']
self.assertEqual([1, None, None, 3], image.shape.as_list())
self.assertEqual(tf.float32, image.dtype)
self.assertEqual(tf.string, receiver_tensors.dtype)
def test_error_with_bad_train_config(self):
"""Tests that a TypeError is raised with improper train config."""
configs = _get_configs_for_model('ssd_inception_v2_pets')
classes = 37
train_input_fn = inputs.create_train_input_fn(
num_classes=classes,
train_config=configs['eval_config'], # Expecting `TrainConfig`.
train_input_config=configs['train_input_config'])
with self.assertRaises(TypeError):
train_input_fn()
def test_error_with_bad_train_input_config(self):
"""Tests that a TypeError is raised with improper train input config."""
configs = _get_configs_for_model('ssd_inception_v2_pets')
classes = 37
train_input_fn = inputs.create_train_input_fn(
num_classes=classes,
train_config=configs['train_config'],
train_input_config=configs['model']) # Expecting `InputReader`.
with self.assertRaises(TypeError):
train_input_fn()
def test_error_with_bad_eval_config(self):
"""Tests that a TypeError is raised with improper eval config."""
configs = _get_configs_for_model('ssd_inception_v2_pets')
classes = 37
eval_input_fn = inputs.create_eval_input_fn(
num_classes=classes,
eval_config=configs['train_config'], # Expecting `EvalConfig`.
eval_input_config=configs['eval_input_config'])
with self.assertRaises(TypeError):
eval_input_fn()
def test_error_with_bad_eval_input_config(self):
"""Tests that a TypeError is raised with improper eval input config."""
configs = _get_configs_for_model('ssd_inception_v2_pets')
classes = 37
eval_input_fn = inputs.create_eval_input_fn(
num_classes=classes,
eval_config=configs['eval_config'],
eval_input_config=configs['model']) # Expecting `InputReader`.
with self.assertRaises(TypeError):
eval_input_fn()
if __name__ == '__main__':
tf.test.main()
......@@ -14,7 +14,8 @@ py_library(
],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/core:matcher",
"//tensorflow/models/research/object_detection/core:matcher",
"//tensorflow/models/research/object_detection/utils:shape_utils",
],
)
......@@ -24,6 +25,7 @@ py_test(
deps = [
":argmax_matcher",
"//tensorflow",
"//tensorflow/models/research/object_detection/utils:test_case",
],
)
......@@ -35,7 +37,7 @@ py_library(
deps = [
"//tensorflow",
"//tensorflow/contrib/image:image_py",
"//tensorflow_models/object_detection/core:matcher",
"//tensorflow/models/research/object_detection/core:matcher",
],
)
......
......@@ -26,10 +26,10 @@ This matcher is used in Fast(er)-RCNN.
Note: matchers are used in TargetAssigners. There is a create_target_assigner
factory function for popular implementations.
"""
import tensorflow as tf
from object_detection.core import matcher
from object_detection.utils import shape_utils
class ArgMaxMatcher(matcher.Matcher):
......@@ -119,7 +119,9 @@ class ArgMaxMatcher(matcher.Matcher):
Returns:
matches: int32 tensor indicating the row each column matches to.
"""
return -1 * tf.ones([tf.shape(similarity_matrix)[1]], dtype=tf.int32)
similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
similarity_matrix)
return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32)
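`combined_static_and_dynamic_shape` (from `object_detection.utils.shape_utils`) returns a per-dimension mix of Python ints, where the static shape is known, and scalar tensors, where it is not, so `tf.ones` above gets a static size whenever one is available. A small sketch of that behavior:
```python
import tensorflow as tf

from object_detection.utils import shape_utils

x = tf.placeholder(tf.float32, shape=[None, 5])
shape = shape_utils.combined_static_and_dynamic_shape(x)
# shape[0] is a scalar int32 tensor (dynamic batch dimension);
# shape[1] is the Python int 5 (statically known).
padding = -1 * tf.ones([shape[1]], dtype=tf.int32)  # static shape [5]
```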
def _match_when_rows_are_non_empty():
"""Performs matching when the rows of similarity matrix are non empty.
......@@ -128,7 +130,7 @@ class ArgMaxMatcher(matcher.Matcher):
matches: int32 tensor indicating the row each column matches to.
"""
# Matches for each column
matches = tf.argmax(similarity_matrix, 0)
matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)
# Deal with matched and unmatched threshold
if self._matched_threshold is not None:
......@@ -156,23 +158,31 @@ class ArgMaxMatcher(matcher.Matcher):
-1)
if self._force_match_for_each_row:
forced_matches_ids = tf.cast(tf.argmax(similarity_matrix, 1), tf.int32)
# Set matches[forced_matches_ids] = [0, ..., R], R is number of rows.
row_range = tf.range(tf.shape(similarity_matrix)[0])
col_range = tf.range(tf.shape(similarity_matrix)[1])
forced_matches_values = tf.cast(row_range, matches.dtype)
keep_matches_ids, _ = tf.setdiff1d(col_range, forced_matches_ids)
keep_matches_values = tf.gather(matches, keep_matches_ids)
matches = tf.dynamic_stitch(
[forced_matches_ids,
keep_matches_ids], [forced_matches_values, keep_matches_values])
return tf.cast(matches, tf.int32)
return tf.cond(
tf.greater(tf.shape(similarity_matrix)[0], 0),
_match_when_rows_are_non_empty, _match_when_rows_are_empty)
similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
similarity_matrix)
force_match_column_ids = tf.argmax(similarity_matrix, 1,
output_type=tf.int32)
force_match_column_indicators = tf.one_hot(
force_match_column_ids, depth=similarity_matrix_shape[1])
force_match_row_ids = tf.argmax(force_match_column_indicators, 0,
output_type=tf.int32)
force_match_column_mask = tf.cast(
tf.reduce_max(force_match_column_indicators, 0), tf.bool)
final_matches = tf.where(force_match_column_mask,
force_match_row_ids, matches)
return final_matches
else:
return matches
if similarity_matrix.shape.is_fully_defined():
if similarity_matrix.shape[0].value == 0:
return _match_when_rows_are_empty()
else:
return _match_when_rows_are_non_empty()
else:
return tf.cond(
tf.greater(tf.shape(similarity_matrix)[0], 0),
_match_when_rows_are_non_empty, _match_when_rows_are_empty)
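The force-match branch above replaces the old `dynamic_stitch` approach with a one-hot formulation. A NumPy analogue of just that step (thresholding is omitted, so this sketches the force-match logic only, not the full matcher):
```python
import numpy as np

similarity = np.array([[1, 1, 1, 3, 1],
                       [-1, 0, -2, -2, -1],
                       [3, 0, -1, 2, 0]], dtype=np.float32)

matches = np.argmax(similarity, axis=0)            # best row for each column
best_col_per_row = np.argmax(similarity, axis=1)   # column each row claims
indicators = np.eye(similarity.shape[1])[best_col_per_row]
force_match_row_ids = np.argmax(indicators, axis=0)
force_match_column_mask = indicators.max(axis=0).astype(bool)
final_matches = np.where(force_match_column_mask, force_match_row_ids, matches)
# final_matches -> [2, 1, 0, 0, 0]: columns 0, 1 and 3 are forced to the rows
# that picked them; the remaining columns keep their argmax matches.
```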
def _set_values_using_indicator(self, x, indicator, val):
"""Set the indicated fields of x to val.
......
......@@ -19,177 +19,168 @@ import numpy as np
import tensorflow as tf
from object_detection.matchers import argmax_matcher
from object_detection.utils import test_case
class ArgMaxMatcherTest(tf.test.TestCase):
class ArgMaxMatcherTest(test_case.TestCase):
def test_return_correct_matches_with_default_thresholds(self):
def graph_fn(similarity_matrix):
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=None)
match = matcher.match(similarity_matrix)
matched_cols = match.matched_column_indicator()
unmatched_cols = match.unmatched_column_indicator()
match_results = match.match_results
return (matched_cols, unmatched_cols, match_results)
similarity = np.array([[1., 1, 1, 3, 1],
[2, -1, 2, 0, 4],
[3, 0, -1, 0, 0]])
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=None)
[3, 0, -1, 0, 0]], dtype=np.float32)
expected_matched_rows = np.array([2, 0, 1, 0, 1])
(res_matched_cols, res_unmatched_cols,
res_match_results) = self.execute(graph_fn, [similarity])
sim = tf.constant(similarity)
match = matcher.match(sim)
matched_cols = match.matched_column_indices()
matched_rows = match.matched_row_indices()
unmatched_cols = match.unmatched_column_indices()
with self.test_session() as sess:
res_matched_cols = sess.run(matched_cols)
res_matched_rows = sess.run(matched_rows)
res_unmatched_cols = sess.run(unmatched_cols)
self.assertAllEqual(res_matched_rows, expected_matched_rows)
self.assertAllEqual(res_matched_cols, np.arange(similarity.shape[1]))
self.assertEmpty(res_unmatched_cols)
self.assertAllEqual(res_match_results[res_matched_cols],
expected_matched_rows)
self.assertAllEqual(np.nonzero(res_matched_cols)[0], [0, 1, 2, 3, 4])
self.assertFalse(np.all(res_unmatched_cols))
def test_return_correct_matches_with_empty_rows(self):
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=None)
sim = 0.2*tf.ones([0, 5])
match = matcher.match(sim)
unmatched_cols = match.unmatched_column_indices()
with self.test_session() as sess:
res_unmatched_cols = sess.run(unmatched_cols)
self.assertAllEqual(res_unmatched_cols, np.arange(5))
def graph_fn(similarity_matrix):
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=None)
match = matcher.match(similarity_matrix)
return match.unmatched_column_indicator()
similarity = 0.2 * np.ones([0, 5], dtype=np.float32)
res_unmatched_cols = self.execute(graph_fn, [similarity])
self.assertAllEqual(np.nonzero(res_unmatched_cols)[0], np.arange(5))
def test_return_correct_matches_with_matched_threshold(self):
def graph_fn(similarity):
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3.)
match = matcher.match(similarity)
matched_cols = match.matched_column_indicator()
unmatched_cols = match.unmatched_column_indicator()
match_results = match.match_results
return (matched_cols, unmatched_cols, match_results)
similarity = np.array([[1, 1, 1, 3, 1],
[2, -1, 2, 0, 4],
[3, 0, -1, 0, 0]], dtype=np.int32)
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3)
[3, 0, -1, 0, 0]], dtype=np.float32)
expected_matched_cols = np.array([0, 3, 4])
expected_matched_rows = np.array([2, 0, 1])
expected_unmatched_cols = np.array([1, 2])
sim = tf.constant(similarity)
match = matcher.match(sim)
matched_cols = match.matched_column_indices()
matched_rows = match.matched_row_indices()
unmatched_cols = match.unmatched_column_indices()
init_op = tf.global_variables_initializer()
(res_matched_cols, res_unmatched_cols,
match_results) = self.execute(graph_fn, [similarity])
self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows)
self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols)
self.assertAllEqual(np.nonzero(res_unmatched_cols)[0],
expected_unmatched_cols)
with self.test_session() as sess:
sess.run(init_op)
res_matched_cols = sess.run(matched_cols)
res_matched_rows = sess.run(matched_rows)
res_unmatched_cols = sess.run(unmatched_cols)
def test_return_correct_matches_with_matched_and_unmatched_threshold(self):
self.assertAllEqual(res_matched_rows, expected_matched_rows)
self.assertAllEqual(res_matched_cols, expected_matched_cols)
self.assertAllEqual(res_unmatched_cols, expected_unmatched_cols)
def graph_fn(similarity):
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3.,
unmatched_threshold=2.)
match = matcher.match(similarity)
matched_cols = match.matched_column_indicator()
unmatched_cols = match.unmatched_column_indicator()
match_results = match.match_results
return (matched_cols, unmatched_cols, match_results)
def test_return_correct_matches_with_matched_and_unmatched_threshold(self):
similarity = np.array([[1, 1, 1, 3, 1],
[2, -1, 2, 0, 4],
[3, 0, -1, 0, 0]], dtype=np.int32)
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3,
unmatched_threshold=2)
[3, 0, -1, 0, 0]], dtype=np.float32)
expected_matched_cols = np.array([0, 3, 4])
expected_matched_rows = np.array([2, 0, 1])
expected_unmatched_cols = np.array([1]) # col 2 has too high maximum val
sim = tf.constant(similarity)
match = matcher.match(sim)
matched_cols = match.matched_column_indices()
matched_rows = match.matched_row_indices()
unmatched_cols = match.unmatched_column_indices()
(res_matched_cols, res_unmatched_cols,
match_results) = self.execute(graph_fn, [similarity])
self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows)
self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols)
self.assertAllEqual(np.nonzero(res_unmatched_cols)[0],
expected_unmatched_cols)
with self.test_session() as sess:
res_matched_cols = sess.run(matched_cols)
res_matched_rows = sess.run(matched_rows)
res_unmatched_cols = sess.run(unmatched_cols)
def test_return_correct_matches_negatives_lower_than_unmatched_false(self):
self.assertAllEqual(res_matched_rows, expected_matched_rows)
self.assertAllEqual(res_matched_cols, expected_matched_cols)
self.assertAllEqual(res_unmatched_cols, expected_unmatched_cols)
def graph_fn(similarity):
matcher = argmax_matcher.ArgMaxMatcher(
matched_threshold=3.,
unmatched_threshold=2.,
negatives_lower_than_unmatched=False)
match = matcher.match(similarity)
matched_cols = match.matched_column_indicator()
unmatched_cols = match.unmatched_column_indicator()
match_results = match.match_results
return (matched_cols, unmatched_cols, match_results)
def test_return_correct_matches_negatives_lower_than_unmatched_false(self):
similarity = np.array([[1, 1, 1, 3, 1],
[2, -1, 2, 0, 4],
[3, 0, -1, 0, 0]], dtype=np.int32)
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3,
unmatched_threshold=2,
negatives_lower_than_unmatched=False)
[3, 0, -1, 0, 0]], dtype=np.float32)
expected_matched_cols = np.array([0, 3, 4])
expected_matched_rows = np.array([2, 0, 1])
expected_unmatched_cols = np.array([2]) # col 1 has too low maximum val
sim = tf.constant(similarity)
match = matcher.match(sim)
matched_cols = match.matched_column_indices()
matched_rows = match.matched_row_indices()
unmatched_cols = match.unmatched_column_indices()
(res_matched_cols, res_unmatched_cols,
match_results) = self.execute(graph_fn, [similarity])
self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows)
self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols)
self.assertAllEqual(np.nonzero(res_unmatched_cols)[0],
expected_unmatched_cols)
with self.test_session() as sess:
res_matched_cols = sess.run(matched_cols)
res_matched_rows = sess.run(matched_rows)
res_unmatched_cols = sess.run(unmatched_cols)
def test_return_correct_matches_unmatched_row_not_using_force_match(self):
self.assertAllEqual(res_matched_rows, expected_matched_rows)
self.assertAllEqual(res_matched_cols, expected_matched_cols)
self.assertAllEqual(res_unmatched_cols, expected_unmatched_cols)
def graph_fn(similarity):
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3.,
unmatched_threshold=2.)
match = matcher.match(similarity)
matched_cols = match.matched_column_indicator()
unmatched_cols = match.unmatched_column_indicator()
match_results = match.match_results
return (matched_cols, unmatched_cols, match_results)
def test_return_correct_matches_unmatched_row_not_using_force_match(self):
similarity = np.array([[1, 1, 1, 3, 1],
[-1, 0, -2, -2, -1],
[3, 0, -1, 2, 0]], dtype=np.int32)
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3,
unmatched_threshold=2)
[3, 0, -1, 2, 0]], dtype=np.float32)
expected_matched_cols = np.array([0, 3])
expected_matched_rows = np.array([2, 0])
expected_unmatched_cols = np.array([1, 2, 4])
sim = tf.constant(similarity)
match = matcher.match(sim)
matched_cols = match.matched_column_indices()
matched_rows = match.matched_row_indices()
unmatched_cols = match.unmatched_column_indices()
with self.test_session() as sess:
res_matched_cols = sess.run(matched_cols)
res_matched_rows = sess.run(matched_rows)
res_unmatched_cols = sess.run(unmatched_cols)
self.assertAllEqual(res_matched_rows, expected_matched_rows)
self.assertAllEqual(res_matched_cols, expected_matched_cols)
self.assertAllEqual(res_unmatched_cols, expected_unmatched_cols)
(res_matched_cols, res_unmatched_cols,
match_results) = self.execute(graph_fn, [similarity])
self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows)
self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols)
self.assertAllEqual(np.nonzero(res_unmatched_cols)[0],
expected_unmatched_cols)
def test_return_correct_matches_unmatched_row_while_using_force_match(self):
def graph_fn(similarity):
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3.,
unmatched_threshold=2.,
force_match_for_each_row=True)
match = matcher.match(similarity)
matched_cols = match.matched_column_indicator()
unmatched_cols = match.unmatched_column_indicator()
match_results = match.match_results
return (matched_cols, unmatched_cols, match_results)
similarity = np.array([[1, 1, 1, 3, 1],
[-1, 0, -2, -2, -1],
[3, 0, -1, 2, 0]], dtype=np.int32)
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3,
unmatched_threshold=2,
force_match_for_each_row=True)
[3, 0, -1, 2, 0]], dtype=np.float32)
expected_matched_cols = np.array([0, 1, 3])
expected_matched_rows = np.array([2, 1, 0])
expected_unmatched_cols = np.array([2, 4])  # cols 2, 4: max vals below unmatched threshold
sim = tf.constant(similarity)
match = matcher.match(sim)
matched_cols = match.matched_column_indices()
matched_rows = match.matched_row_indices()
unmatched_cols = match.unmatched_column_indices()
with self.test_session() as sess:
res_matched_cols = sess.run(matched_cols)
res_matched_rows = sess.run(matched_rows)
res_unmatched_cols = sess.run(unmatched_cols)
self.assertAllEqual(res_matched_rows, expected_matched_rows)
self.assertAllEqual(res_matched_cols, expected_matched_cols)
self.assertAllEqual(res_unmatched_cols, expected_unmatched_cols)
(res_matched_cols, res_unmatched_cols,
match_results) = self.execute(graph_fn, [similarity])
self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows)
self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols)
self.assertAllEqual(np.nonzero(res_unmatched_cols)[0],
expected_unmatched_cols)
def test_valid_arguments_corner_case(self):
argmax_matcher.ArgMaxMatcher(matched_threshold=1,
......@@ -211,27 +202,6 @@ class ArgMaxMatcherTest(tf.test.TestCase):
argmax_matcher.ArgMaxMatcher(matched_threshold=1,
unmatched_threshold=2)
def test_set_values_using_indicator(self):
input_a = np.array([3, 4, 5, 1, 4, 3, 2])
expected_b = np.array([3, 0, 0, 1, 0, 3, 2]) # Set a>3 to 0
expected_c = np.array(
[3., 4., 5., -1., 4., 3., -1.]) # Set a<3 to -1. Float32
idxb_ = input_a > 3
idxc_ = input_a < 3
matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=None)
a = tf.constant(input_a)
idxb = tf.constant(idxb_)
idxc = tf.constant(idxc_)
b = matcher._set_values_using_indicator(a, idxb, 0)
c = matcher._set_values_using_indicator(tf.cast(a, tf.float32), idxc, -1)
with self.test_session() as sess:
res_b = sess.run(b)
res_c = sess.run(c)
self.assertAllEqual(res_b, expected_b)
self.assertAllEqual(res_c, expected_c)
if __name__ == '__main__':
tf.test.main()
......@@ -27,8 +27,8 @@ class GreedyBipartiteMatcher(matcher.Matcher):
def _match(self, similarity_matrix, num_valid_rows=-1):
"""Bipartite matches a collection rows and columns. A greedy bi-partite.
TODO: Add num_valid_columns options to match only that many columns with
all the rows.
TODO: Add num_valid_columns options to match only that many columns
with all the rows.
Args:
similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
......
......@@ -13,12 +13,14 @@ py_library(
srcs = ["ssd_meta_arch.py"],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow_models/object_detection/core:box_predictor",
"//tensorflow_models/object_detection/core:model",
"//tensorflow_models/object_detection/core:target_assigner",
"//tensorflow_models/object_detection/utils:shape_utils",
"//tensorflow_models/object_detection/utils:visualization_utils",
"//tensorflow/models/research/object_detection/core:box_list",
"//tensorflow/models/research/object_detection/core:box_predictor",
"//tensorflow/models/research/object_detection/core:model",
"//tensorflow/models/research/object_detection/core:target_assigner",
"//tensorflow/models/research/object_detection/utils:ops",
"//tensorflow/models/research/object_detection/utils:shape_utils",
"//tensorflow/models/research/object_detection/utils:test_case",
"//tensorflow/models/research/object_detection/utils:visualization_utils",
],
)
......@@ -28,13 +30,12 @@ py_test(
deps = [
":ssd_meta_arch",
"//tensorflow",
"//tensorflow/python:training",
"//tensorflow_models/object_detection/core:anchor_generator",
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow_models/object_detection/core:losses",
"//tensorflow_models/object_detection/core:post_processing",
"//tensorflow_models/object_detection/core:region_similarity_calculator",
"//tensorflow_models/object_detection/utils:test_utils",
"//tensorflow/models/research/object_detection/core:anchor_generator",
"//tensorflow/models/research/object_detection/core:box_list",
"//tensorflow/models/research/object_detection/core:losses",
"//tensorflow/models/research/object_detection/core:post_processing",
"//tensorflow/models/research/object_detection/core:region_similarity_calculator",
"//tensorflow/models/research/object_detection/utils:test_utils",
],
)
......@@ -45,18 +46,18 @@ py_library(
],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/anchor_generators:grid_anchor_generator",
"//tensorflow_models/object_detection/core:balanced_positive_negative_sampler",
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow_models/object_detection/core:box_list_ops",
"//tensorflow_models/object_detection/core:box_predictor",
"//tensorflow_models/object_detection/core:losses",
"//tensorflow_models/object_detection/core:model",
"//tensorflow_models/object_detection/core:post_processing",
"//tensorflow_models/object_detection/core:standard_fields",
"//tensorflow_models/object_detection/core:target_assigner",
"//tensorflow_models/object_detection/utils:ops",
"//tensorflow_models/object_detection/utils:shape_utils",
"//tensorflow/models/research/object_detection/anchor_generators:grid_anchor_generator",
"//tensorflow/models/research/object_detection/core:balanced_positive_negative_sampler",
"//tensorflow/models/research/object_detection/core:box_list",
"//tensorflow/models/research/object_detection/core:box_list_ops",
"//tensorflow/models/research/object_detection/core:box_predictor",
"//tensorflow/models/research/object_detection/core:losses",
"//tensorflow/models/research/object_detection/core:model",
"//tensorflow/models/research/object_detection/core:post_processing",
"//tensorflow/models/research/object_detection/core:standard_fields",
"//tensorflow/models/research/object_detection/core:target_assigner",
"//tensorflow/models/research/object_detection/utils:ops",
"//tensorflow/models/research/object_detection/utils:shape_utils",
],
)
......@@ -68,14 +69,14 @@ py_library(
deps = [
":faster_rcnn_meta_arch",
"//tensorflow",
"//tensorflow_models/object_detection/anchor_generators:grid_anchor_generator",
"//tensorflow_models/object_detection/builders:box_predictor_builder",
"//tensorflow_models/object_detection/builders:hyperparams_builder",
"//tensorflow_models/object_detection/builders:post_processing_builder",
"//tensorflow_models/object_detection/core:losses",
"//tensorflow_models/object_detection/protos:box_predictor_py_pb2",
"//tensorflow_models/object_detection/protos:hyperparams_py_pb2",
"//tensorflow_models/object_detection/protos:post_processing_py_pb2",
"//tensorflow/models/research/object_detection/anchor_generators:grid_anchor_generator",
"//tensorflow/models/research/object_detection/builders:box_predictor_builder",
"//tensorflow/models/research/object_detection/builders:hyperparams_builder",
"//tensorflow/models/research/object_detection/builders:post_processing_builder",
"//tensorflow/models/research/object_detection/core:losses",
"//tensorflow/models/research/object_detection/protos:box_predictor_py_pb2",
"//tensorflow/models/research/object_detection/protos:hyperparams_py_pb2",
"//tensorflow/models/research/object_detection/protos:post_processing_py_pb2",
],
)
......@@ -93,8 +94,8 @@ py_library(
deps = [
":faster_rcnn_meta_arch",
"//tensorflow",
"//tensorflow_models/object_detection/core:box_predictor",
"//tensorflow_models/object_detection/utils:ops",
"//tensorflow/models/research/object_detection/core:box_predictor",
"//tensorflow/models/research/object_detection/utils:ops",
],
)
......
......@@ -21,13 +21,17 @@ See Faster R-CNN: Ren, Shaoqing, et al.
"Faster R-CNN: Towards real-time object detection with region proposal
networks." Advances in neural information processing systems. 2015.
We allow for two modes: first_stage_only=True and first_stage_only=False. In
the former setting, all of the user facing methods (e.g., predict, postprocess,
loss) can be used as if the model consisted only of the RPN, returning class
agnostic proposals (these can be thought of as approximate detections with no
associated class information). In the latter setting, proposals are computed,
then passed through a second stage "box classifier" to yield (multi-class)
detections.
We allow for three modes: number_of_stages={1, 2, 3}. With 1 stage,
all of the user-facing methods (e.g., predict, postprocess, loss) can be used as
if the model consisted only of the RPN, returning class-agnostic proposals
(these can be thought of as approximate detections with no associated class
information). With 2 stages, proposals are computed and then passed
through a second stage "box classifier" to yield (multi-class) detections.
Finally, with 3 stages (used only during eval), proposals are computed, passed
through a second stage "box classifier" that computes refined boxes and
classes, and then features are pooled from the refined, non-maximum-suppressed
boxes and passed through the box classifier again. If the number of stages is 3
during training, it is automatically reduced to 2.
FasterRCNNFeatureExtractor and override three methods: `preprocess`,
......@@ -62,6 +66,32 @@ Following the API (see model.DetectionModel definition), our outputs after
postprocessing operations are always normalized boxes however, internally, we
sometimes convert to absolute --- e.g. for loss computation. In particular,
anchors and proposal_boxes are both represented as absolute coordinates.
Images are resized in the `preprocess` method.
The Faster R-CNN meta architecture has two post-processing methods:
`_postprocess_rpn`, which is applied after the first stage, and
`_postprocess_box_classifier`, which is applied after the second stage. There
are three different ways post-processing can happen depending on the
number_of_stages configured in the meta architecture:
1. When number_of_stages is 1:
`_postprocess_rpn` is run as part of the `postprocess` method, where
true_image_shapes is used to clip proposals, perform non-max suppression and
normalize them.
2. When number_of_stages is 2:
`_postprocess_rpn` is run as part of the `_predict_second_stage` method, where
`resized_image_shapes` is used to clip proposals, perform non-max suppression
and normalize them. In this case the `postprocess` method skips
`_postprocess_rpn` and only runs `_postprocess_box_classifier`, using
`true_image_shapes` to clip detections, perform non-max suppression and
normalize them.
3. When number_of_stages is 3:
`_postprocess_rpn` is run as part of the `_predict_second_stage` method, using
`resized_image_shapes` to clip proposals, perform non-max suppression and
normalize them. Subsequently, `_postprocess_box_classifier` is run as part of
`_predict_third_stage`, using `true_image_shapes` to clip detections, perform
non-max suppression and normalize them. In this case, the `postprocess` method
skips both `_postprocess_rpn` and `_postprocess_box_classifier`.
"""
from abc import abstractmethod
from functools import partial
......@@ -152,7 +182,8 @@ class FasterRCNNFeatureExtractor(object):
[batch_size * self.max_num_proposals, height, width, depth]
representing box classifier features for each proposal.
"""
with tf.variable_scope(scope, values=[proposal_feature_maps]):
with tf.variable_scope(
scope, values=[proposal_feature_maps], reuse=tf.AUTO_REUSE):
return self._extract_box_classifier_features(proposal_feature_maps, scope)
@abstractmethod
......@@ -194,7 +225,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
num_classes,
image_resizer_fn,
feature_extractor,
first_stage_only,
number_of_stages,
first_stage_anchor_generator,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope,
......@@ -232,12 +263,22 @@ class FasterRCNNMetaArch(model.DetectionModel):
assigned classification targets can range from {0,... K}).
image_resizer_fn: A callable for image resizing. This callable
takes a rank-3 image tensor of shape [height, width, channels]
(corresponding to a single image) and returns a rank-3 image tensor,
possibly with new spatial dimensions. See
builders/image_resizer_builder.py.
(corresponding to a single image) and an optional rank-3 instance mask
tensor of shape [num_masks, height, width], and returns a resized rank-3
image tensor along with a resized mask tensor if one was provided in the
input. In addition, this callable must also return a 1-D tensor of the form
[height, width, channels] containing the size of the true image, as the
image resizer can perform zero padding. See protos/image_resizer.proto.
feature_extractor: A FasterRCNNFeatureExtractor object.
first_stage_only: Whether to construct only the Region Proposal Network
(RPN) part of the model.
number_of_stages: An integer taking values in {1, 2, 3}. If
1, the function will construct only the Region Proposal Network (RPN)
part of the model. If 2, the function will perform box refinement and
other auxiliary predictions all in the second stage. If 3, it will
extract features from refined boxes and perform the auxiliary
predictions on the non-maximum suppressed refined boxes.
If is_training is true and the value of number_of_stages is 3, it is
reduced to 2, since all the model heads are trained in parallel in the
second stage during training.
first_stage_anchor_generator: An anchor_generator.AnchorGenerator object
(note that currently we only support
grid_anchor_generator.GridAnchorGenerator objects)
......@@ -333,7 +374,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._is_training = is_training
self._image_resizer_fn = image_resizer_fn
self._feature_extractor = feature_extractor
self._first_stage_only = first_stage_only
self._number_of_stages = number_of_stages
# The first class is reserved as background.
unmatched_cls_target = tf.constant(
......@@ -368,9 +409,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._first_stage_max_proposals = first_stage_max_proposals
self._first_stage_localization_loss = (
losses.WeightedSmoothL1LocalizationLoss(anchorwise_output=True))
losses.WeightedSmoothL1LocalizationLoss())
self._first_stage_objectness_loss = (
losses.WeightedSoftmaxClassificationLoss(anchorwise_output=True))
losses.WeightedSoftmaxClassificationLoss())
self._first_stage_loc_loss_weight = first_stage_localization_loss_weight
self._first_stage_obj_loss_weight = first_stage_objectness_loss_weight
......@@ -389,10 +430,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._second_stage_score_conversion_fn = second_stage_score_conversion_fn
self._second_stage_localization_loss = (
losses.WeightedSmoothL1LocalizationLoss(anchorwise_output=True))
losses.WeightedSmoothL1LocalizationLoss())
self._second_stage_classification_loss = second_stage_classification_loss
self._second_stage_mask_loss = (
losses.WeightedSigmoidClassificationLoss(anchorwise_output=True))
losses.WeightedSigmoidClassificationLoss())
self._second_stage_loc_loss_weight = second_stage_localization_loss_weight
self._second_stage_cls_loss_weight = second_stage_classification_loss_weight
self._second_stage_mask_loss_weight = (
......@@ -400,6 +441,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._hard_example_miner = hard_example_miner
self._parallel_iterations = parallel_iterations
if self._number_of_stages <= 0 or self._number_of_stages > 3:
raise ValueError('Number of stages should be a value in {1, 2, 3}.')
if self._is_training and self._number_of_stages == 3:
self._number_of_stages = 2
@property
def first_stage_feature_extractor_scope(self):
return 'FirstStageFeatureExtractor'
......@@ -432,6 +478,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
return self._second_stage_batch_size
return self._first_stage_max_proposals
@property
def anchors(self):
if not self._anchors:
raise RuntimeError('anchors have not been constructed yet!')
if not isinstance(self._anchors, box_list.BoxList):
raise RuntimeError('anchors should be a BoxList object, but is not.')
return self._anchors
def preprocess(self, inputs):
"""Feature-extractor specific preprocessing.
......@@ -448,24 +502,53 @@ class FasterRCNNMetaArch(model.DetectionModel):
Returns:
preprocessed_inputs: a [batch, height_out, width_out, channels] float
tensor representing a batch of images.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Raises:
ValueError: if inputs tensor does not have type tf.float32
"""
if inputs.dtype is not tf.float32:
raise ValueError('`preprocess` expects a tf.float32 tensor')
with tf.name_scope('Preprocessor'):
resized_inputs = tf.map_fn(self._image_resizer_fn,
elems=inputs,
dtype=tf.float32,
parallel_iterations=self._parallel_iterations)
return self._feature_extractor.preprocess(resized_inputs)
outputs = shape_utils.static_or_dynamic_map_fn(
self._image_resizer_fn,
elems=inputs,
dtype=[tf.float32, tf.int32],
parallel_iterations=self._parallel_iterations)
resized_inputs = outputs[0]
true_image_shapes = outputs[1]
return (self._feature_extractor.preprocess(resized_inputs),
true_image_shapes)
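# A minimal sketch of the (preprocessed_inputs, true_image_shapes) contract
# above, assuming TF1 graph mode; the 300x300 padded canvas stands in for a
# hypothetical resizer configuration:
import tensorflow as tf

image = tf.zeros([1, 200, 250, 3])  # one unpadded input image
padded = tf.image.pad_to_bounding_box(image, 0, 0, 300, 300)
true_image_shapes = tf.constant([[200, 250, 3]])  # per-image true shape
# Downstream stages use `true_image_shapes` to clip detections to the valid
# [0, 0, 200, 250] region instead of the padded 300x300 canvas.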
def _compute_clip_window(self, image_shapes):
"""Computes clip window for non max suppression based on image shapes.
def predict(self, preprocessed_inputs):
This function assumes that the clip window's top-left corner is at (0, 0).
Args:
image_shapes: A 2-D int32 tensor of shape [batch_size, 3] containing
shapes of images in the batch. Each row represents [height, width,
channels] of an image.
Returns:
A 2-D float32 tensor of shape [batch_size, 4] containing the clip window
for each image in the form [ymin, xmin, ymax, xmax].
"""
clip_heights = image_shapes[:, 0]
clip_widths = image_shapes[:, 1]
clip_window = tf.to_float(tf.stack([tf.zeros_like(clip_heights),
tf.zeros_like(clip_heights),
clip_heights, clip_widths], axis=1))
return clip_window
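# A minimal sketch of the computation above on concrete values, assuming a
# TF1 session; the image shapes are hypothetical:
import tensorflow as tf

image_shapes = tf.constant([[300, 400, 3], [200, 250, 3]], dtype=tf.int32)
clip_heights = image_shapes[:, 0]
clip_widths = image_shapes[:, 1]
clip_window = tf.to_float(tf.stack([tf.zeros_like(clip_heights),
                                    tf.zeros_like(clip_heights),
                                    clip_heights, clip_widths], axis=1))
with tf.Session() as sess:
  # Prints [[0. 0. 300. 400.] [0. 0. 200. 250.]]: one [ymin, xmin, ymax,
  # xmax] window per image.
  print(sess.run(clip_window))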
def predict(self, preprocessed_inputs, true_image_shapes):
"""Predicts unpostprocessed tensors from input tensor.
This function takes an input batch of images and runs it through the
forward pass of the network to yield "raw" un-postprocessed predictions.
If `first_stage_only` is True, this function only returns first stage
If `number_of_stages` is 1, this function only returns first stage
RPN predictions (un-postprocessed). Otherwise it returns both
first stage RPN predictions and second stage box classifier
predictions.
......@@ -481,6 +564,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
......@@ -504,7 +591,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
`num_anchors` can differ depending on whether the model is created in
training or inference mode.
(and if first_stage_only=False):
(and if number_of_stages > 1):
7) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
......@@ -526,6 +613,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
11) mask_predictions: (optional) a 4-D tensor with shape
[total_num_padded_proposals, num_classes, mask_height, mask_width]
containing instance mask predictions.
Raises:
ValueError: If `predict` is called before `preprocess`.
"""
(rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist,
image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs)
......@@ -544,7 +634,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
anchors_boxlist = box_list_ops.clip_to_window(
anchors_boxlist, clip_window)
anchors = anchors_boxlist.get()
self._anchors = anchors_boxlist
prediction_dict = {
'rpn_box_predictor_features': rpn_box_predictor_features,
'rpn_features_to_crop': rpn_features_to_crop,
......@@ -552,22 +642,46 @@ class FasterRCNNMetaArch(model.DetectionModel):
'rpn_box_encodings': rpn_box_encodings,
'rpn_objectness_predictions_with_background':
rpn_objectness_predictions_with_background,
'anchors': anchors
'anchors': self._anchors.get()
}
if not self._first_stage_only:
if self._number_of_stages >= 2:
prediction_dict.update(self._predict_second_stage(
rpn_box_encodings,
rpn_objectness_predictions_with_background,
rpn_features_to_crop,
anchors, image_shape))
self._anchors.get(), image_shape, true_image_shapes))
if self._number_of_stages == 3:
prediction_dict = self._predict_third_stage(
prediction_dict, true_image_shapes)
return prediction_dict
def _image_batch_shape_2d(self, image_batch_shape_1d):
"""Takes a 1-D image batch shape tensor and converts it to a 2-D tensor.
Example:
If the 1-D image batch shape tensor is [2, 300, 300, 3], the corresponding
2-D image batch shape tensor is [[300, 300, 3], [300, 300, 3]].
Args:
image_batch_shape_1d: 1-D tensor of the form [batch_size, height,
width, channels].
Returns:
image_batch_shape_2d: 2-D tensor of shape [batch_size, 3] where each row is
of the form [height, width, channels].
"""
return tf.tile(tf.expand_dims(image_batch_shape_1d[1:], 0),
[image_batch_shape_1d[0], 1])
def _predict_second_stage(self, rpn_box_encodings,
rpn_objectness_predictions_with_background,
rpn_features_to_crop,
anchors,
image_shape):
image_shape,
true_image_shapes):
"""Predicts the output tensors from second stage of Faster R-CNN.
Args:
......@@ -584,6 +698,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
anchors: 2-D float tensor of shape
[num_anchors, self._box_coder.code_size].
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
......@@ -617,9 +735,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
[total_num_padded_proposals, num_classes, mask_height, mask_width]
containing instance mask predictions.
"""
image_shape_2d = self._image_batch_shape_2d(image_shape)
proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn(
rpn_box_encodings, rpn_objectness_predictions_with_background,
anchors, image_shape)
anchors, image_shape_2d, true_image_shapes)
flattened_proposal_feature_maps = (
self._compute_second_stage_input_feature_maps(
......@@ -630,10 +749,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
flattened_proposal_feature_maps,
scope=self.second_stage_feature_extractor_scope))
predict_auxiliary_outputs = False
if self._number_of_stages == 2:
predict_auxiliary_outputs = True
box_predictions = self._mask_rcnn_box_predictor.predict(
box_classifier_features,
num_predictions_per_location=1,
scope=self.second_stage_box_predictor_scope)
[box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
predict_boxes_and_classes=True,
predict_auxiliary_outputs=predict_auxiliary_outputs)
refined_box_encodings = tf.squeeze(
box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
class_predictions_with_background = tf.squeeze(box_predictions[
......@@ -658,6 +783,100 @@ class FasterRCNNMetaArch(model.DetectionModel):
return prediction_dict
def _predict_third_stage(self, prediction_dict, image_shapes):
"""Predicts non-box, non-class outputs using refined detections.
Args:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
2) class_predictions_with_background: a 3-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
Note that this tensor *includes* background class predictions
(at class index 0).
3) num_proposals: An int32 tensor of shape [batch_size] representing the
number of proposals generated by the RPN. `num_proposals` allows us
to keep track of which entries are to be treated as zero paddings and
which are not since we always pad the number of proposals to be
`self.max_num_proposals` for each image.
4) proposal_boxes: A float32 tensor of shape
[batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes in absolute coordinates.
image_shapes: A 2-D int32 tensor of shape [batch_size, 3] containing
shapes of images in the batch.
Returns:
prediction_dict: a dictionary that, in addition to the input predictions,
also holds the following predictions:
1) mask_predictions: (optional) a 4-D tensor with shape
[batch_size, max_detection, mask_height, mask_width] containing
instance mask predictions.
"""
detections_dict = self._postprocess_box_classifier(
prediction_dict['refined_box_encodings'],
prediction_dict['class_predictions_with_background'],
prediction_dict['proposal_boxes'],
prediction_dict['num_proposals'],
image_shapes)
prediction_dict.update(detections_dict)
detection_boxes = detections_dict[
fields.DetectionResultFields.detection_boxes]
detection_classes = detections_dict[
fields.DetectionResultFields.detection_classes]
rpn_features_to_crop = prediction_dict['rpn_features_to_crop']
batch_size = tf.shape(detection_boxes)[0]
max_detection = tf.shape(detection_boxes)[1]
flattened_detected_feature_maps = (
self._compute_second_stage_input_feature_maps(
rpn_features_to_crop, detection_boxes))
detected_box_classifier_features = (
self._feature_extractor.extract_box_classifier_features(
flattened_detected_feature_maps,
scope=self.second_stage_feature_extractor_scope))
box_predictions = self._mask_rcnn_box_predictor.predict(
[detected_box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
predict_boxes_and_classes=False,
predict_auxiliary_outputs=True)
if box_predictor.MASK_PREDICTIONS in box_predictions:
detection_masks = tf.squeeze(box_predictions[
box_predictor.MASK_PREDICTIONS], axis=1)
detection_masks = self._gather_instance_masks(detection_masks,
detection_classes)
mask_height = tf.shape(detection_masks)[1]
mask_width = tf.shape(detection_masks)[2]
prediction_dict[fields.DetectionResultFields.detection_masks] = (
tf.reshape(detection_masks,
[batch_size, max_detection, mask_height, mask_width]))
return prediction_dict
def _gather_instance_masks(self, instance_masks, classes):
"""Gathers the masks that correspond to classes.
Args:
instance_masks: A 4-D float32 tensor with shape
[K, num_classes, mask_height, mask_width].
classes: A 2-D int32 tensor with shape [batch_size, max_detection].
Returns:
masks: a 3-D float32 tensor with shape [K, mask_height, mask_width].
"""
k = tf.shape(instance_masks)[0]
num_mask_classes = tf.shape(instance_masks)[1]
instance_mask_height = tf.shape(instance_masks)[2]
instance_mask_width = tf.shape(instance_masks)[3]
classes = tf.reshape(classes, [-1])
instance_masks = tf.reshape(instance_masks, [
-1, instance_mask_height, instance_mask_width
])
return tf.gather(instance_masks,
tf.range(k) * num_mask_classes + tf.to_int32(classes))
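# A minimal sketch of the flattened gather above, assuming a TF1 session;
# K=2 detections, 3 classes and the class ids are hypothetical:
import tensorflow as tf

instance_masks = tf.reshape(tf.range(24, dtype=tf.float32),
                            [2, 3, 2, 2])  # [K, num_classes, H, W]
classes = tf.constant([[1, 2]], dtype=tf.int32)  # [batch_size, max_detection]
flat_masks = tf.reshape(instance_masks, [-1, 2, 2])  # [K * num_classes, H, W]
indices = tf.range(2) * 3 + tf.reshape(classes, [-1])  # => [1, 5]
# Picks the class-1 mask of detection 0 and the class-2 mask of detection 1.
picked = tf.gather(flat_masks, indices)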
def _extract_rpn_feature_maps(self, preprocessed_inputs):
"""Extracts RPN features.
......@@ -728,8 +947,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
raise RuntimeError('anchor_generator is expected to generate anchors '
'corresponding to a single feature map.')
box_predictions = self._first_stage_box_predictor.predict(
rpn_box_predictor_features,
num_anchors_per_location[0],
[rpn_box_predictor_features],
num_anchors_per_location,
scope=self.first_stage_box_predictor_scope)
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
......@@ -776,7 +995,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
pruned_anchors_boxlist, keep_indices = box_list_ops.prune_outside_window(
anchors_boxlist, clip_window)
def _batch_gather_kept_indices(predictions_tensor):
return tf.map_fn(
return shape_utils.static_or_dynamic_map_fn(
partial(tf.gather, indices=keep_indices),
elems=predictions_tensor,
dtype=tf.float32,
......@@ -804,7 +1023,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
combined_shape[2:])
return tf.reshape(inputs, flattened_shape)
def postprocess(self, prediction_dict):
def postprocess(self, prediction_dict, true_image_shapes):
"""Convert prediction tensors to final detections.
This function converts raw prediction tensors to final detection results.
......@@ -812,20 +1031,24 @@ class FasterRCNNMetaArch(model.DetectionModel):
scores are to be interpreted as logits, but if a score_converter is used,
then scores are remapped (and may thus have a different interpretation).
If first_stage_only=True, the returned results represent proposals from the
If number_of_stages=1, the returned results represent proposals from the
first stage RPN and are padded to have self.max_num_proposals for each
image; otherwise, the results can be interpreted as multiclass detections
from the full two-stage model and are padded to self._max_detections.
Args:
prediction_dict: a dictionary holding prediction tensors (see the
documentation for the predict method. If first_stage_only=True, we
documentation for the predict method). If number_of_stages=1, we
expect prediction_dict to contain `rpn_box_encodings`,
`rpn_objectness_predictions_with_background`, `rpn_features_to_crop`,
`image_shape`, and `anchors` fields. Otherwise we expect
prediction_dict to additionally contain `refined_box_encodings`,
and `anchors` fields. Otherwise we expect prediction_dict to
additionally contain `refined_box_encodings`,
`class_predictions_with_background`, `num_proposals`,
`proposal_boxes` and, optionally, `mask_predictions` fields.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Returns:
detections: a dictionary containing the following fields
......@@ -834,36 +1057,55 @@ class FasterRCNNMetaArch(model.DetectionModel):
detection_classes: [batch, max_detections]
(this entry is only created if number_of_stages > 1)
num_detections: [batch]
Raises:
ValueError: If `predict` is called before `preprocess`.
"""
with tf.name_scope('FirstStagePostprocessor'):
image_shape = prediction_dict['image_shape']
if self._first_stage_only:
if self._number_of_stages == 1:
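# `true_image_shapes` is passed below both as the clip windows for NMS and
# as the true shapes, since at postprocessing time proposals should be
# clipped and normalized with respect to the unpadded image regions.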
proposal_boxes, proposal_scores, num_proposals = self._postprocess_rpn(
prediction_dict['rpn_box_encodings'],
prediction_dict['rpn_objectness_predictions_with_background'],
prediction_dict['anchors'],
image_shape)
true_image_shapes,
true_image_shapes)
return {
'detection_boxes': proposal_boxes,
'detection_scores': proposal_scores,
'num_detections': tf.to_float(num_proposals)
fields.DetectionResultFields.detection_boxes: proposal_boxes,
fields.DetectionResultFields.detection_scores: proposal_scores,
fields.DetectionResultFields.num_detections:
tf.to_float(num_proposals),
}
with tf.name_scope('SecondStagePostprocessor'):
mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
detections_dict = self._postprocess_box_classifier(
prediction_dict['refined_box_encodings'],
prediction_dict['class_predictions_with_background'],
prediction_dict['proposal_boxes'],
prediction_dict['num_proposals'],
image_shape,
mask_predictions=mask_predictions)
if self._number_of_stages == 2:
mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
detections_dict = self._postprocess_box_classifier(
prediction_dict['refined_box_encodings'],
prediction_dict['class_predictions_with_background'],
prediction_dict['proposal_boxes'],
prediction_dict['num_proposals'],
true_image_shapes,
mask_predictions=mask_predictions)
return detections_dict
if self._number_of_stages == 3:
# Post processing is already performed in 3rd stage. We need to transfer
# postprocessed tensors from `prediction_dict` to `detections_dict`.
detections_dict = {}
for key in prediction_dict:
if key == fields.DetectionResultFields.detection_masks:
detections_dict[key] = tf.sigmoid(prediction_dict[key])
elif 'detection' in key:
detections_dict[key] = prediction_dict[key]
return detections_dict
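# A minimal sketch of the key transfer above; the tensors are hypothetical
# stand-ins for the postprocessed third-stage outputs:
import tensorflow as tf

prediction_dict = {
    'detection_scores': tf.constant([[0.9, 0.1]]),
    'detection_masks': tf.constant([[[[2.0]]]]),  # mask logits
    'num_proposals': tf.constant([2]),  # key lacks 'detection': dropped
}
detections_dict = {}
for key in prediction_dict:
  if key == 'detection_masks':
    detections_dict[key] = tf.sigmoid(prediction_dict[key])
  elif 'detection' in key:
    detections_dict[key] = prediction_dict[key]
# detections_dict keeps only the two 'detection*' entries, with the mask
# logits squashed into [0, 1].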
def _postprocess_rpn(self,
rpn_box_encodings_batch,
rpn_objectness_predictions_with_background_batch,
anchors,
image_shape):
image_shapes,
true_image_shapes):
"""Converts first stage prediction tensors from the RPN to proposals.
This function decodes the raw RPN predictions, runs non-max suppression
......@@ -885,7 +1127,12 @@ class FasterRCNNMetaArch(model.DetectionModel):
anchors: A 2-D tensor of shape [num_anchors, 4] representing anchors
for the first stage RPN. Note that `num_anchors` can differ depending
on whether the model is created in training or inference mode.
image_shape: A 1-D tensor representing the input image shape.
image_shapes: A 2-D tensor of shape [batch, 3] containing the shapes of
images in the batch.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Returns:
proposal_boxes: A float tensor with shape
......@@ -909,7 +1156,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes = tf.squeeze(proposal_boxes, axis=2)
rpn_objectness_softmax_without_background = tf.nn.softmax(
rpn_objectness_predictions_with_background_batch)[:, :, 1]
clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
clip_window = self._compute_clip_window(image_shapes)
(proposal_boxes, proposal_scores, _, _, _,
num_proposals) = post_processing.batch_multiclass_non_max_suppression(
tf.expand_dims(proposal_boxes, axis=2),
......@@ -924,19 +1171,22 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes = tf.stop_gradient(proposal_boxes)
if not self._hard_example_miner:
(groundtruth_boxlists, groundtruth_classes_with_background_list,
_) = self._format_groundtruth_data(image_shape)
_) = self._format_groundtruth_data(true_image_shapes)
(proposal_boxes, proposal_scores,
num_proposals) = self._unpad_proposals_and_sample_box_classifier_batch(
proposal_boxes, proposal_scores, num_proposals,
groundtruth_boxlists, groundtruth_classes_with_background_list)
# normalize proposal boxes
proposal_boxes_reshaped = tf.reshape(proposal_boxes, [-1, 4])
normalized_proposal_boxes_reshaped = box_list_ops.to_normalized_coordinates(
box_list.BoxList(proposal_boxes_reshaped),
image_shape[1], image_shape[2], check_range=False).get()
proposal_boxes = tf.reshape(normalized_proposal_boxes_reshaped,
[-1, proposal_boxes.shape[1].value, 4])
return proposal_boxes, proposal_scores, num_proposals
def normalize_boxes(args):
proposal_boxes_per_image = args[0]
image_shape = args[1]
normalized_boxes_per_image = box_list_ops.to_normalized_coordinates(
box_list.BoxList(proposal_boxes_per_image), image_shape[0],
image_shape[1], check_range=False).get()
return normalized_boxes_per_image
normalized_proposal_boxes = shape_utils.static_or_dynamic_map_fn(
normalize_boxes, elems=[proposal_boxes, image_shapes], dtype=tf.float32)
return normalized_proposal_boxes, proposal_scores, num_proposals
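# A minimal sketch of the per-image normalization above, assuming a TF1
# session; the box and image shape values are hypothetical:
import tensorflow as tf
from object_detection.core import box_list
from object_detection.core import box_list_ops

boxes = tf.constant([[0., 0., 150., 200.]])  # absolute [ymin, xmin, ymax, xmax]
normalized = box_list_ops.to_normalized_coordinates(
    box_list.BoxList(boxes), 300, 400, check_range=False).get()
# For a true image shape of [300, 400, 3] this yields [[0., 0., 0.5, 0.5]].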
def _unpad_proposals_and_sample_box_classifier_batch(
self,
......@@ -951,7 +1201,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes: A float tensor with shape
[batch_size, num_proposals, 4] representing the (potentially zero
padded) proposal boxes for all images in the batch. These boxes are
represented as normalized coordinates.
represented in absolute coordinates.
proposal_scores: A float tensor with shape
[batch_size, num_proposals] representing the (potentially zero
padded) proposal objectness scores for all images in the batch.
......@@ -968,7 +1218,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes: A float tensor with shape
[batch_size, second_stage_batch_size, 4] representing the (potentially
zero padded) proposal boxes for all images in the batch. These boxes
are represented as normalized coordinates.
are represented in absolute coordinates.
proposal_scores: A float tensor with shape
[batch_size, second_stage_batch_size] representing the (potentially zero
padded) proposal objectness scores for all images in the batch.
......@@ -1022,7 +1272,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
tf.stack(single_image_proposal_score_sample),
tf.stack(single_image_num_proposals_sample))
def _format_groundtruth_data(self, image_shape):
def _format_groundtruth_data(self, true_image_shapes):
"""Helper function for preparing groundtruth data for target assignment.
In order to be consistent with the model.DetectionModel interface,
......@@ -1035,8 +1285,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
image_shape.
Args:
image_shape: A 1-D int32 tensor of shape [4] representing the shape of the
input image batch.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Returns:
groundtruth_boxlists: A list of BoxLists containing (absolute) coordinates
......@@ -1050,8 +1302,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
"""
groundtruth_boxlists = [
box_list_ops.to_absolute_coordinates(
box_list.BoxList(boxes), image_shape[1], image_shape[2])
for boxes in self.groundtruth_lists(fields.BoxListFields.boxes)]
box_list.BoxList(boxes), true_image_shapes[i, 0],
true_image_shapes[i, 1])
for i, boxes in enumerate(
self.groundtruth_lists(fields.BoxListFields.boxes))
]
groundtruth_classes_with_background_list = [
tf.to_float(
tf.pad(one_hot_encoding, [[0, 0], [1, 0]], mode='CONSTANT'))
......@@ -1063,12 +1318,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
if groundtruth_masks_list is not None:
resized_masks_list = []
for mask in groundtruth_masks_list:
resized_4d_mask = tf.image.resize_images(
tf.expand_dims(mask, axis=3),
image_shape[1:3],
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
align_corners=True)
resized_masks_list.append(tf.squeeze(resized_4d_mask, axis=3))
_, resized_mask, _ = self._image_resizer_fn(
# Reuse the given `image_resizer_fn` to resize groundtruth masks.
# `mask` tensor for an image is of the shape [num_masks,
# image_height, image_width]. Below we create a dummy image of the
# shape [image_height, image_width, 1] to use with
# `image_resizer_fn`.
image=tf.zeros(tf.stack([tf.shape(mask)[1], tf.shape(mask)[2], 1])),
masks=mask)
resized_masks_list.append(resized_mask)
groundtruth_masks_list = resized_masks_list
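# A minimal sketch of the dummy-image trick above; `fake_resizer` is a
# hypothetical stand-in for the configured `image_resizer_fn`:
import tensorflow as tf

def fake_resizer(image, masks=None):
  # Real resizer functions return [resized_image, resized_masks, true_shape].
  return [tf.identity(image), tf.identity(masks), tf.shape(image)]

mask = tf.zeros([5, 24, 32])  # [num_masks, height, width]
_, resized_mask, _ = fake_resizer(
    image=tf.zeros(tf.stack([tf.shape(mask)[1], tf.shape(mask)[2], 1])),
    masks=mask)
# The masks are resized exactly as the corresponding image would be, keeping
# images, boxes and masks in one coordinate frame.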
return (groundtruth_boxlists, groundtruth_classes_with_background_list,
......@@ -1152,7 +1411,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
class_predictions_with_background,
proposal_boxes,
num_proposals,
image_shape,
image_shapes,
mask_predictions=None):
"""Converts predictions from the second stage box classifier to detections.
......@@ -1169,7 +1428,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
bounding boxes in absolute coordinates.
num_proposals: a 1-D int32 tensor of shape [batch] representing the number
of proposals predicted for each image in the batch.
image_shape: a 1-D int32 tensor representing the input image shape.
image_shapes: a 2-D int32 tensor containing shapes of the input images in
the batch.
mask_predictions: (optional) a 4-D float tensor with shape
[total_num_padded_proposals, num_classes, mask_height, mask_width]
containing instance mask prediction logits.
......@@ -1202,8 +1462,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
tf.slice(class_predictions_with_background_batch,
[0, 0, 1], [-1, -1, -1]),
[-1, self.max_num_proposals, self.num_classes])
clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
clip_window = self._compute_clip_window(image_shapes)
mask_predictions_batch = None
if mask_predictions is not None:
mask_height = mask_predictions.shape[2].value
......@@ -1220,12 +1479,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
change_coordinate_frame=True,
num_valid_boxes=num_proposals,
masks=mask_predictions_batch)
detections = {'detection_boxes': nmsed_boxes,
'detection_scores': nmsed_scores,
'detection_classes': nmsed_classes,
'num_detections': tf.to_float(num_detections)}
detections = {
fields.DetectionResultFields.detection_boxes: nmsed_boxes,
fields.DetectionResultFields.detection_scores: nmsed_scores,
fields.DetectionResultFields.detection_classes: nmsed_classes,
fields.DetectionResultFields.num_detections: tf.to_float(num_detections)
}
if nmsed_masks is not None:
detections['detection_masks'] = nmsed_masks
detections[fields.DetectionResultFields.detection_masks] = nmsed_masks
return detections
def _batch_decode_boxes(self, box_encodings, anchor_boxes):
......@@ -1257,22 +1518,26 @@ class FasterRCNNMetaArch(model.DetectionModel):
tf.stack([combined_shape[0], combined_shape[1],
num_classes, 4]))
def loss(self, prediction_dict, scope=None):
def loss(self, prediction_dict, true_image_shapes, scope=None):
"""Compute scalar loss tensors given prediction tensors.
If first_stage_only=True, only RPN related losses are computed (i.e.,
If number_of_stages=1, only RPN related losses are computed (i.e.,
`rpn_localization_loss` and `rpn_objectness_loss`). Otherwise all
losses are computed.
Args:
prediction_dict: a dictionary holding prediction tensors (see the
documentation for the predict method. If first_stage_only=True, we
documentation for the predict method). If number_of_stages=1, we
expect prediction_dict to contain `rpn_box_encodings`,
`rpn_objectness_predictions_with_background`, `rpn_features_to_crop`,
`image_shape`, and `anchors` fields. Otherwise we expect
prediction_dict to additionally contain `refined_box_encodings`,
`class_predictions_with_background`, `num_proposals`, and
`proposal_boxes` fields.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
scope: Optional scope name.
Returns:
......@@ -1283,15 +1548,15 @@ class FasterRCNNMetaArch(model.DetectionModel):
"""
with tf.name_scope(scope, 'Loss', prediction_dict.values()):
(groundtruth_boxlists, groundtruth_classes_with_background_list,
groundtruth_masks_list
) = self._format_groundtruth_data(prediction_dict['image_shape'])
groundtruth_masks_list) = self._format_groundtruth_data(
true_image_shapes)
loss_dict = self._loss_rpn(
prediction_dict['rpn_box_encodings'],
prediction_dict['rpn_objectness_predictions_with_background'],
prediction_dict['anchors'],
groundtruth_boxlists,
groundtruth_classes_with_background_list)
if not self._first_stage_only:
if self._number_of_stages > 1:
loss_dict.update(
self._loss_box_classifier(
prediction_dict['refined_box_encodings'],
......@@ -1352,7 +1617,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
return self._first_stage_sampler.subsample(
tf.cast(cls_weights, tf.bool),
self._first_stage_minibatch_size, tf.cast(cls_targets, tf.bool))
batch_sampled_indices = tf.to_float(tf.map_fn(
batch_sampled_indices = tf.to_float(shape_utils.static_or_dynamic_map_fn(
_minibatch_subsample_fn,
[batch_cls_targets, batch_cls_weights],
dtype=tf.bool,
......@@ -1491,10 +1756,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
second_stage_loc_losses = self._second_stage_localization_loss(
reshaped_refined_box_encodings,
batch_reg_targets, weights=batch_reg_weights) / normalizer
second_stage_cls_losses = self._second_stage_classification_loss(
class_predictions_with_background,
batch_cls_targets_with_background,
weights=batch_cls_weights) / normalizer
second_stage_cls_losses = ops.reduce_sum_trailing_dimensions(
self._second_stage_classification_loss(
class_predictions_with_background,
batch_cls_targets_with_background,
weights=batch_cls_weights),
ndims=2) / normalizer
second_stage_loc_loss = tf.reduce_sum(
tf.boolean_mask(second_stage_loc_losses, paddings_indicator))
second_stage_cls_loss = tf.reduce_sum(
......@@ -1522,9 +1790,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
# Create a new target assigner that matches the proposals to groundtruth
# and returns the mask targets.
# TODO: Move `unmatched_cls_target` from constructor to assign function.
# This will enable reuse of a single target assigner for both class
# targets and mask targets.
# TODO: Move `unmatched_cls_target` from constructor to assign
# function. This will enable reuse of a single target assigner for both
# class targets and mask targets.
mask_target_assigner = target_assigner.create_target_assigner(
'FasterRCNN', 'detection',
unmatched_cls_target=tf.zeros(image_shape[1:3], dtype=tf.float32))
......@@ -1566,14 +1834,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
flat_cropped_gt_mask,
[batch_size, -1, mask_height * mask_width])
second_stage_mask_losses = self._second_stage_mask_loss(
reshaped_prediction_masks,
batch_cropped_gt_mask,
weights=batch_mask_target_weights) / (
mask_height * mask_width *
tf.maximum(tf.reduce_sum(batch_mask_target_weights, axis=1,
keep_dims=True),
tf.ones((batch_size, 1))))
second_stage_mask_losses = ops.reduce_sum_trailing_dimensions(
self._second_stage_mask_loss(
reshaped_prediction_masks,
batch_cropped_gt_mask,
weights=batch_mask_target_weights),
ndims=2) / (
mask_height * mask_width * tf.maximum(
tf.reduce_sum(
batch_mask_target_weights, axis=1, keep_dims=True
), tf.ones((batch_size, 1))))
second_stage_mask_loss = tf.reduce_sum(
tf.boolean_mask(second_stage_mask_losses, paddings_indicator))
......@@ -1647,7 +1917,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
cls_losses=tf.expand_dims(single_image_cls_loss, 0),
decoded_boxlist_list=[proposal_boxlist])
def restore_map(self, from_detection_checkpoint=True):
def restore_map(self,
from_detection_checkpoint=True,
load_all_detection_checkpoint_vars=False):
"""Returns a map of variables to load from a foreign checkpoint.
See parent class for details.
......@@ -1655,7 +1927,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
from_detection_checkpoint: whether to restore from a full detection
checkpoint (with compatible variable names) or to restore from a
classification checkpoint for initialization prior to training.
classification checkpoint for initialization prior to training. Default
True.
load_all_detection_checkpoint_vars: whether to load all variables (when
`from_detection_checkpoint` is True). If False, only variables within
the feature extractor scopes are included. Default False.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
......@@ -1670,8 +1946,12 @@ class FasterRCNNMetaArch(model.DetectionModel):
variables_to_restore.append(slim.get_or_create_global_step())
# Only load feature extractor variables to be consistent with loading from
# a classification checkpoint.
include_patterns = None
if not load_all_detection_checkpoint_vars:
include_patterns = [
self.first_stage_feature_extractor_scope,
self.second_stage_feature_extractor_scope
]
feature_extractor_variables = tf.contrib.framework.filter_variables(
variables_to_restore,
include_patterns=[self.first_stage_feature_extractor_scope,
self.second_stage_feature_extractor_scope])
variables_to_restore, include_patterns=include_patterns)
return {var.op.name: var for var in feature_extractor_variables}
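# A minimal sketch of the scope-based filtering above, assuming TF1's
# tf.contrib.framework.filter_variables; the variable names are hypothetical:
import tensorflow as tf

with tf.variable_scope('FirstStageFeatureExtractor'):
  kept = tf.get_variable('conv_weights', shape=[3, 3, 3, 8])
with tf.variable_scope('SecondStageBoxPredictor'):
  dropped = tf.get_variable('fc_weights', shape=[8, 4])
filtered = tf.contrib.framework.filter_variables(
    [kept, dropped],
    include_patterns=['FirstStageFeatureExtractor',
                      'SecondStageFeatureExtractor'])
# `filtered` contains only `kept`; with include_patterns=None (the
# load_all_detection_checkpoint_vars=True path) both variables survive.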
......@@ -26,7 +26,7 @@ class FasterRCNNMetaArchTest(
def test_postprocess_second_stage_only_inference_mode_with_masks(self):
model = self._build_model(
is_training=False, first_stage_only=False, second_stage_batch_size=6)
is_training=False, number_of_stages=2, second_stage_batch_size=6)
batch_size = 2
total_num_padded_proposals = batch_size * model.max_num_proposals
......@@ -61,6 +61,7 @@ class FasterRCNNMetaArchTest(
[[1, 1], [1, 1]],
[[0, 0], [0, 0]]]])
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
detections = model.postprocess({
'refined_box_encodings': refined_box_encodings,
'class_predictions_with_background': class_predictions_with_background,
......@@ -68,7 +69,7 @@ class FasterRCNNMetaArchTest(
'proposal_boxes': proposal_boxes,
'image_shape': image_shape,
'mask_predictions': mask_predictions
})
}, true_image_shapes)
with self.test_session() as sess:
detections_out = sess.run(detections)
self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4])
......@@ -79,6 +80,227 @@ class FasterRCNNMetaArchTest(
self.assertAllClose(detections_out['num_detections'], [5, 4])
self.assertAllClose(detections_out['detection_masks'],
exp_detection_masks)
self.assertTrue(np.amax(detections_out['detection_masks']) <= 1.0)
self.assertTrue(np.amin(detections_out['detection_masks']) >= 0.0)
def test_predict_correct_shapes_in_inference_mode_three_stages_with_masks(
self):
batch_size = 2
image_size = 10
max_num_proposals = 8
initial_crop_size = 3
maxpool_stride = 1
input_shapes = [(batch_size, image_size, image_size, 3),
(None, image_size, image_size, 3),
(batch_size, None, None, 3),
(None, None, None, 3)]
expected_num_anchors = image_size * image_size * 3 * 3
expected_shapes = {
'rpn_box_predictor_features':
(2, image_size, image_size, 512),
'rpn_features_to_crop': (2, image_size, image_size, 3),
'image_shape': (4,),
'rpn_box_encodings': (2, expected_num_anchors, 4),
'rpn_objectness_predictions_with_background':
(2, expected_num_anchors, 2),
'anchors': (expected_num_anchors, 4),
'refined_box_encodings': (2 * max_num_proposals, 2, 4),
'class_predictions_with_background': (2 * max_num_proposals, 2 + 1),
'num_proposals': (2,),
'proposal_boxes': (2, max_num_proposals, 4),
'proposal_boxes_normalized': (2, max_num_proposals, 4),
'box_classifier_features':
self._get_box_classifier_features_shape(image_size,
batch_size,
max_num_proposals,
initial_crop_size,
maxpool_stride,
3)
}
for input_shape in input_shapes:
test_graph = tf.Graph()
with test_graph.as_default():
model = self._build_model(
is_training=False,
number_of_stages=3,
second_stage_batch_size=2,
predict_masks=True)
preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape)
_, true_image_shapes = model.preprocess(preprocessed_inputs)
result_tensor_dict = model.predict(preprocessed_inputs,
true_image_shapes)
init_op = tf.global_variables_initializer()
with self.test_session(graph=test_graph) as sess:
sess.run(init_op)
tensor_dict_out = sess.run(result_tensor_dict, feed_dict={
preprocessed_inputs:
np.zeros((batch_size, image_size, image_size, 3))})
self.assertEqual(
set(tensor_dict_out.keys()),
set(expected_shapes.keys()).union(
set([
'detection_boxes', 'detection_scores', 'detection_classes',
'detection_masks', 'num_detections'
])))
for key in expected_shapes:
self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
self.assertAllEqual(tensor_dict_out['detection_boxes'].shape, [2, 5, 4])
self.assertAllEqual(tensor_dict_out['detection_masks'].shape,
[2, 5, 14, 14])
self.assertAllEqual(tensor_dict_out['detection_classes'].shape, [2, 5])
self.assertAllEqual(tensor_dict_out['detection_scores'].shape, [2, 5])
self.assertAllEqual(tensor_dict_out['num_detections'].shape, [2])
def test_predict_gives_correct_shapes_in_train_mode_both_stages_with_masks(
self):
test_graph = tf.Graph()
with test_graph.as_default():
model = self._build_model(
is_training=True,
number_of_stages=2,
second_stage_batch_size=7,
predict_masks=True)
batch_size = 2
image_size = 10
max_num_proposals = 7
initial_crop_size = 3
maxpool_stride = 1
image_shape = (batch_size, image_size, image_size, 3)
preprocessed_inputs = tf.zeros(image_shape, dtype=tf.float32)
groundtruth_boxes_list = [
tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32),
tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)
]
groundtruth_classes_list = [
tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)
]
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
result_tensor_dict = model.predict(preprocessed_inputs, true_image_shapes)
expected_shapes = {
'rpn_box_predictor_features': (2, image_size, image_size, 512),
'rpn_features_to_crop': (2, image_size, image_size, 3),
'image_shape': (4,),
'refined_box_encodings': (2 * max_num_proposals, 2, 4),
'class_predictions_with_background': (2 * max_num_proposals, 2 + 1),
'num_proposals': (2,),
'proposal_boxes': (2, max_num_proposals, 4),
'proposal_boxes_normalized': (2, max_num_proposals, 4),
'box_classifier_features':
self._get_box_classifier_features_shape(
image_size, batch_size, max_num_proposals, initial_crop_size,
maxpool_stride, 3),
'mask_predictions': (2 * max_num_proposals, 2, 14, 14)
}
init_op = tf.global_variables_initializer()
with self.test_session(graph=test_graph) as sess:
sess.run(init_op)
tensor_dict_out = sess.run(result_tensor_dict)
self.assertEqual(
set(tensor_dict_out.keys()),
set(expected_shapes.keys()).union(
set([
'rpn_box_encodings',
'rpn_objectness_predictions_with_background',
'anchors',
])))
for key in expected_shapes:
self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
anchors_shape_out = tensor_dict_out['anchors'].shape
self.assertEqual(2, len(anchors_shape_out))
self.assertEqual(4, anchors_shape_out[1])
num_anchors_out = anchors_shape_out[0]
self.assertAllEqual(tensor_dict_out['rpn_box_encodings'].shape,
(2, num_anchors_out, 4))
self.assertAllEqual(
tensor_dict_out['rpn_objectness_predictions_with_background'].shape,
(2, num_anchors_out, 2))
def test_postprocess_third_stage_only_inference_mode(self):
num_proposals_shapes = [(2), (None,)]
refined_box_encodings_shapes = [(16, 2, 4), (None, 2, 4)]
class_predictions_with_background_shapes = [(16, 3), (None, 3)]
proposal_boxes_shapes = [(2, 8, 4), (None, 8, 4)]
batch_size = 2
image_shape = np.array((2, 36, 48, 3), dtype=np.int32)
for (num_proposals_shape, refined_box_encoding_shape,
class_predictions_with_background_shape,
proposal_boxes_shape) in zip(num_proposals_shapes,
refined_box_encodings_shapes,
class_predictions_with_background_shapes,
proposal_boxes_shapes):
tf_graph = tf.Graph()
with tf_graph.as_default():
model = self._build_model(
is_training=False, number_of_stages=3,
second_stage_batch_size=6, predict_masks=True)
total_num_padded_proposals = batch_size * model.max_num_proposals
proposal_boxes = np.array(
[[[1, 1, 2, 3],
[0, 0, 1, 1],
[.5, .5, .6, .6],
4*[0], 4*[0], 4*[0], 4*[0], 4*[0]],
[[2, 3, 6, 8],
[1, 2, 5, 3],
4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]])
num_proposals = np.array([3, 2], dtype=np.int32)
refined_box_encodings = np.zeros(
[total_num_padded_proposals, model.num_classes, 4])
class_predictions_with_background = np.ones(
[total_num_padded_proposals, model.num_classes+1])
num_proposals_placeholder = tf.placeholder(tf.int32,
shape=num_proposals_shape)
refined_box_encodings_placeholder = tf.placeholder(
tf.float32, shape=refined_box_encoding_shape)
class_predictions_with_background_placeholder = tf.placeholder(
tf.float32, shape=class_predictions_with_background_shape)
proposal_boxes_placeholder = tf.placeholder(
tf.float32, shape=proposal_boxes_shape)
image_shape_placeholder = tf.placeholder(tf.int32, shape=(4))
_, true_image_shapes = model.preprocess(
tf.zeros(image_shape_placeholder))
detections = model.postprocess({
'refined_box_encodings': refined_box_encodings_placeholder,
'class_predictions_with_background':
class_predictions_with_background_placeholder,
'num_proposals': num_proposals_placeholder,
'proposal_boxes': proposal_boxes_placeholder,
'image_shape': image_shape_placeholder,
'detection_boxes': tf.zeros([2, 5, 4]),
'detection_masks': tf.zeros([2, 5, 14, 14]),
'detection_scores': tf.zeros([2, 5]),
'detection_classes': tf.zeros([2, 5]),
'num_detections': tf.zeros([2]),
}, true_image_shapes)
with self.test_session(graph=tf_graph) as sess:
detections_out = sess.run(
detections,
feed_dict={
refined_box_encodings_placeholder: refined_box_encodings,
class_predictions_with_background_placeholder:
class_predictions_with_background,
num_proposals_placeholder: num_proposals,
proposal_boxes_placeholder: proposal_boxes,
image_shape_placeholder: image_shape
})
self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4])
self.assertAllEqual(detections_out['detection_masks'].shape,
[2, 5, 14, 14])
self.assertAllClose(detections_out['detection_scores'].shape, [2, 5])
self.assertAllClose(detections_out['detection_classes'].shape, [2, 5])
self.assertAllClose(detections_out['num_detections'].shape, [2])
self.assertTrue(np.amax(detections_out['detection_masks']) <= 1.0)
self.assertTrue(np.amin(detections_out['detection_masks']) >= 0.0)
def _get_box_classifier_features_shape(self,
image_size,
......
......@@ -89,10 +89,39 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
"""
return box_predictor_text_proto
def _get_second_stage_box_predictor(self, num_classes, is_training):
def _add_mask_to_second_stage_box_predictor_text_proto(self):
box_predictor_text_proto = """
mask_rcnn_box_predictor {
predict_instance_masks: true
mask_height: 14
mask_width: 14
conv_hyperparams {
op: CONV
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
truncated_normal_initializer {
stddev: 0.01
}
}
}
}
"""
return box_predictor_text_proto
def _get_second_stage_box_predictor(self, num_classes, is_training,
predict_masks):
box_predictor_proto = box_predictor_pb2.BoxPredictor()
text_format.Merge(self._get_second_stage_box_predictor_text_proto(),
box_predictor_proto)
if predict_masks:
text_format.Merge(
self._add_mask_to_second_stage_box_predictor_text_proto(),
box_predictor_proto)
return box_predictor_builder.build(
hyperparams_builder.build,
box_predictor_proto,
......@@ -109,15 +138,36 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
def _build_model(self,
is_training,
first_stage_only,
number_of_stages,
second_stage_batch_size,
first_stage_max_proposals=8,
num_classes=2,
hard_mining=False,
softmax_second_stage_classification_loss=True):
def image_resizer_fn(image):
return tf.identity(image)
softmax_second_stage_classification_loss=True,
predict_masks=False,
pad_to_max_dimension=None):
def image_resizer_fn(image, masks=None):
"""Fake image resizer function."""
resized_inputs = []
resized_image = tf.identity(image)
if pad_to_max_dimension is not None:
resized_image = tf.image.pad_to_bounding_box(image, 0, 0,
pad_to_max_dimension,
pad_to_max_dimension)
resized_inputs.append(resized_image)
if masks is not None:
resized_masks = tf.identity(masks)
if pad_to_max_dimension is not None:
resized_masks = tf.image.pad_to_bounding_box(tf.transpose(masks,
[1, 2, 0]),
0, 0,
pad_to_max_dimension,
pad_to_max_dimension)
resized_masks = tf.transpose(resized_masks, [2, 0, 1])
resized_inputs.append(resized_masks)
resized_inputs.append(tf.shape(image))
return resized_inputs
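# A minimal sketch of what this fake resizer yields (hypothetical shapes):
# with pad_to_max_dimension=56 and an image of shape [10, 10, 3] it returns
# [padded_image of shape (56, 56, 3), tf.shape(image) == [10, 10, 3]],
# matching the [resized_image, (resized_masks,) true_shape] contract that
# `preprocess` unpacks via static_or_dynamic_map_fn.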
# anchors in this test are designed so that a subset of anchors are inside
# the image and a subset of anchors are outside.
......@@ -181,10 +231,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
second_stage_classification_loss_weight = 1.0
if softmax_second_stage_classification_loss:
second_stage_classification_loss = (
losses.WeightedSoftmaxClassificationLoss(anchorwise_output=True))
losses.WeightedSoftmaxClassificationLoss())
else:
second_stage_classification_loss = (
losses.WeightedSigmoidClassificationLoss(anchorwise_output=True))
losses.WeightedSigmoidClassificationLoss())
hard_example_miner = None
if hard_mining:
......@@ -201,7 +251,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'num_classes': num_classes,
'image_resizer_fn': image_resizer_fn,
'feature_extractor': fake_feature_extractor,
'first_stage_only': first_stage_only,
'number_of_stages': number_of_stages,
'first_stage_anchor_generator': first_stage_anchor_generator,
'first_stage_atrous_rate': first_stage_atrous_rate,
'first_stage_box_predictor_arg_scope':
......@@ -232,23 +282,27 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
second_stage_classification_loss,
'hard_example_miner': hard_example_miner}
return self._get_model(self._get_second_stage_box_predictor(
num_classes=num_classes, is_training=is_training), **common_kwargs)
return self._get_model(
self._get_second_stage_box_predictor(
num_classes=num_classes,
is_training=is_training,
predict_masks=predict_masks), **common_kwargs)
def test_predict_gives_correct_shapes_in_inference_mode_first_stage_only(
self):
test_graph = tf.Graph()
with test_graph.as_default():
model = self._build_model(
is_training=False, first_stage_only=True, second_stage_batch_size=2)
is_training=False, number_of_stages=1, second_stage_batch_size=2)
batch_size = 2
height = 10
width = 12
input_image_shape = (batch_size, height, width, 3)
preprocessed_inputs = tf.placeholder(dtype=tf.float32,
shape=(batch_size, None, None, 3))
prediction_dict = model.predict(preprocessed_inputs)
_, true_image_shapes = model.preprocess(tf.zeros(input_image_shape))
preprocessed_inputs = tf.placeholder(
dtype=tf.float32, shape=(batch_size, None, None, 3))
prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
# In inference mode, anchors are clipped to the image window, but not
# pruned. Since MockFasterRCNN.extract_proposal_features returns a
......@@ -269,7 +323,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
}
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
with self.test_session(graph=test_graph) as sess:
sess.run(init_op)
prediction_out = sess.run(prediction_dict,
feed_dict={
......@@ -295,14 +349,15 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
test_graph = tf.Graph()
with test_graph.as_default():
model = self._build_model(
is_training=True, first_stage_only=True, second_stage_batch_size=2)
is_training=True, number_of_stages=1, second_stage_batch_size=2)
batch_size = 2
height = 10
width = 12
input_image_shape = (batch_size, height, width, 3)
preprocessed_inputs = tf.placeholder(dtype=tf.float32,
shape=(batch_size, None, None, 3))
prediction_dict = model.predict(preprocessed_inputs)
_, true_image_shapes = model.preprocess(tf.zeros(input_image_shape))
preprocessed_inputs = tf.placeholder(
dtype=tf.float32, shape=(batch_size, None, None, 3))
prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
expected_output_keys = set([
'rpn_box_predictor_features', 'rpn_features_to_crop', 'image_shape',
......@@ -314,7 +369,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
num_anchors_strict_upper_bound = height * width * 3 * 3
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
with self.test_session(graph=test_graph) as sess:
sess.run(init_op)
prediction_out = sess.run(prediction_dict,
feed_dict={
......@@ -344,8 +399,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
prediction_out['rpn_objectness_predictions_with_background'].shape,
(batch_size, num_anchors_out, 2))
def test_predict_correct_shapes_in_inference_mode_both_stages(
self):
def test_predict_correct_shapes_in_inference_mode_two_stages(self):
batch_size = 2
image_size = 10
max_num_proposals = 8
......@@ -384,10 +438,14 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
test_graph = tf.Graph()
with test_graph.as_default():
model = self._build_model(
is_training=False, first_stage_only=False,
second_stage_batch_size=2)
is_training=False,
number_of_stages=2,
second_stage_batch_size=2,
predict_masks=False)
preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape)
result_tensor_dict = model.predict(preprocessed_inputs)
_, true_image_shapes = model.preprocess(preprocessed_inputs)
result_tensor_dict = model.predict(
preprocessed_inputs, true_image_shapes)
init_op = tf.global_variables_initializer()
with self.test_session(graph=test_graph) as sess:
sess.run(init_op)
......@@ -403,7 +461,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
test_graph = tf.Graph()
with test_graph.as_default():
model = self._build_model(
is_training=True, first_stage_only=False, second_stage_batch_size=7)
is_training=True,
number_of_stages=2,
second_stage_batch_size=7,
predict_masks=False)
batch_size = 2
image_size = 10
......@@ -420,10 +481,11 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
result_tensor_dict = model.predict(preprocessed_inputs)
result_tensor_dict = model.predict(preprocessed_inputs, true_image_shapes)
expected_shapes = {
'rpn_box_predictor_features':
(2, image_size, image_size, 512),
......@@ -444,7 +506,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
}
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
with self.test_session(graph=test_graph) as sess:
sess.run(init_op)
tensor_dict_out = sess.run(result_tensor_dict)
self.assertEqual(set(tensor_dict_out.keys()),
......@@ -465,9 +527,11 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
tensor_dict_out['rpn_objectness_predictions_with_background'].shape,
(2, num_anchors_out, 2))
def test_postprocess_first_stage_only_inference_mode(self):
def _test_postprocess_first_stage_only_inference_mode(
self, pad_to_max_dimension=None):
model = self._build_model(
is_training=False, first_stage_only=True, second_stage_batch_size=6)
is_training=False, number_of_stages=1, second_stage_batch_size=6,
pad_to_max_dimension=pad_to_max_dimension)
batch_size = 2
anchors = tf.constant(
[[0, 0, 16, 16],
......@@ -490,13 +554,13 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[10, -11]]], dtype=tf.float32)
rpn_features_to_crop = tf.ones((batch_size, 8, 8, 10), dtype=tf.float32)
image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
proposals = model.postprocess({
'rpn_box_encodings': rpn_box_encodings,
'rpn_objectness_predictions_with_background':
rpn_objectness_predictions_with_background,
'rpn_features_to_crop': rpn_features_to_crop,
'anchors': anchors,
'image_shape': image_shape})
'anchors': anchors}, true_image_shapes)
expected_proposal_boxes = [
[[0, 0, .5, .5], [.5, .5, 1, 1], [0, .5, .5, 1], [.5, 0, 1.0, .5]]
+ 4 * [4 * [0]],
......@@ -518,9 +582,18 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self.assertAllEqual(proposals_out['num_detections'],
expected_num_proposals)
def test_postprocess_first_stage_only_train_mode(self):
def test_postprocess_first_stage_only_inference_mode(self):
self._test_postprocess_first_stage_only_inference_mode()
def test_postprocess_first_stage_only_inference_mode_padded_image(self):
self._test_postprocess_first_stage_only_inference_mode(
pad_to_max_dimension=56)
def _test_postprocess_first_stage_only_train_mode(self,
pad_to_max_dimension=None):
model = self._build_model(
is_training=True, first_stage_only=True, second_stage_batch_size=2)
is_training=True, number_of_stages=1, second_stage_batch_size=2,
pad_to_max_dimension=pad_to_max_dimension)
batch_size = 2
anchors = tf.constant(
[[0, 0, 16, 16],
......@@ -549,6 +622,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
proposals = model.postprocess({
......@@ -556,8 +630,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'rpn_objectness_predictions_with_background':
rpn_objectness_predictions_with_background,
'rpn_features_to_crop': rpn_features_to_crop,
'anchors': anchors,
'image_shape': image_shape})
'anchors': anchors}, true_image_shapes)
expected_proposal_boxes = [
[[0, 0, .5, .5], [.5, .5, 1, 1]], [[0, .5, .5, 1], [.5, 0, 1, .5]]]
expected_proposal_scores = [[1, 1],
......@@ -577,8 +650,15 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self.assertAllEqual(proposals_out['num_detections'],
expected_num_proposals)
def test_postprocess_second_stage_only_inference_mode(self):
num_proposals_shapes = [(2), (None)]
def test_postprocess_first_stage_only_train_mode(self):
self._test_postprocess_first_stage_only_train_mode()
def test_postprocess_first_stage_only_train_mode_padded_image(self):
self._test_postprocess_first_stage_only_train_mode(pad_to_max_dimension=56)
def _test_postprocess_second_stage_only_inference_mode(
self, pad_to_max_dimension=None):
num_proposals_shapes = [(2), (None,)]
refined_box_encodings_shapes = [(16, 2, 4), (None, 2, 4)]
class_predictions_with_background_shapes = [(16, 3), (None, 3)]
proposal_boxes_shapes = [(2, 8, 4), (None, 8, 4)]
......@@ -593,8 +673,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
tf_graph = tf.Graph()
with tf_graph.as_default():
model = self._build_model(
is_training=False, first_stage_only=False,
second_stage_batch_size=6)
is_training=False, number_of_stages=2,
second_stage_batch_size=6,
pad_to_max_dimension=pad_to_max_dimension)
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
total_num_padded_proposals = batch_size * model.max_num_proposals
proposal_boxes = np.array(
[[[1, 1, 2, 3],
......@@ -626,8 +708,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
class_predictions_with_background_placeholder,
'num_proposals': num_proposals_placeholder,
'proposal_boxes': proposal_boxes_placeholder,
'image_shape': image_shape_placeholder,
})
}, true_image_shapes)
with self.test_session(graph=tf_graph) as sess:
detections_out = sess.run(
detections,
......@@ -646,21 +727,28 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
[[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]])
self.assertAllClose(detections_out['num_detections'], [5, 4])
def test_postprocess_second_stage_only_inference_mode(self):
self._test_postprocess_second_stage_only_inference_mode()
def test_postprocess_second_stage_only_inference_mode_padded_image(self):
self._test_postprocess_second_stage_only_inference_mode(
pad_to_max_dimension=56)
def test_preprocess_preserves_input_shapes(self):
image_shapes = [(3, None, None, 3),
(None, 10, 10, 3),
(None, None, None, 3)]
for image_shape in image_shapes:
model = self._build_model(
is_training=False, first_stage_only=False, second_stage_batch_size=6)
is_training=False, number_of_stages=2, second_stage_batch_size=6)
image_placeholder = tf.placeholder(tf.float32, shape=image_shape)
preprocessed_inputs = model.preprocess(image_placeholder)
preprocessed_inputs, _ = model.preprocess(image_placeholder)
self.assertAllEqual(preprocessed_inputs.shape.as_list(), image_shape)
# TODO: Split test into two - with and without masks.
def test_loss_first_stage_only_mode(self):
model = self._build_model(
is_training=True, first_stage_only=True, second_stage_batch_size=6)
is_training=True, number_of_stages=1, second_stage_batch_size=6)
batch_size = 2
anchors = tf.constant(
[[0, 0, 16, 16],
......@@ -698,9 +786,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'image_shape': image_shape,
'anchors': anchors
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
loss_dict = model.loss(prediction_dict)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
self.assertAllClose(loss_dict_out['first_stage_localization_loss'], 0)
......@@ -711,7 +800,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
# TODO: Split test into two - with and without masks.
def test_loss_full(self):
model = self._build_model(
is_training=True, first_stage_only=False, second_stage_batch_size=6)
is_training=True, number_of_stages=2, second_stage_batch_size=6)
batch_size = 2
anchors = tf.constant(
[[0, 0, 16, 16],
......@@ -793,10 +882,11 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'num_proposals': num_proposals,
'mask_predictions': mask_predictions_logits
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list)
loss_dict = model.loss(prediction_dict)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
......@@ -808,7 +898,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
def test_loss_full_zero_padded_proposals(self):
model = self._build_model(
is_training=True, first_stage_only=False, second_stage_batch_size=6)
is_training=True, number_of_stages=2, second_stage_batch_size=6)
batch_size = 1
anchors = tf.constant(
[[0, 0, 16, 16],
......@@ -880,10 +970,11 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'num_proposals': num_proposals,
'mask_predictions': mask_predictions_logits
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list)
loss_dict = model.loss(prediction_dict)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
......@@ -895,7 +986,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
def test_loss_full_multiple_label_groundtruth(self):
model = self._build_model(
is_training=True, first_stage_only=False, second_stage_batch_size=6,
is_training=True, number_of_stages=2, second_stage_batch_size=6,
softmax_second_stage_classification_loss=False)
batch_size = 1
anchors = tf.constant(
......@@ -975,10 +1066,11 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'num_proposals': num_proposals,
'mask_predictions': mask_predictions_logits
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list)
loss_dict = model.loss(prediction_dict)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
......@@ -990,7 +1082,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
def test_loss_full_zero_padded_proposals_nonzero_loss_with_two_images(self):
model = self._build_model(
is_training=True, first_stage_only=False, second_stage_batch_size=6)
is_training=True, number_of_stages=2, second_stage_batch_size=6)
batch_size = 2
anchors = tf.constant(
[[0, 0, 16, 16],
......@@ -1074,9 +1166,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'proposal_boxes': proposal_boxes,
'num_proposals': num_proposals
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
loss_dict = model.loss(prediction_dict)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
......@@ -1089,7 +1182,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
def test_loss_with_hard_mining(self):
model = self._build_model(is_training=True,
first_stage_only=False,
number_of_stages=2,
second_stage_batch_size=None,
first_stage_max_proposals=6,
hard_mining=True)
......@@ -1163,9 +1256,10 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
'proposal_boxes': proposal_boxes,
'num_proposals': num_proposals
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
loss_dict = model.loss(prediction_dict)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
......@@ -1185,7 +1279,7 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
save_path = self.get_temp_dir()
with self.test_session() as sess:
with self.test_session(graph=test_graph_classification) as sess:
sess.run(init_op)
saved_model_path = saver.save(sess, save_path)
......@@ -1194,64 +1288,89 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
test_graph_detection = tf.Graph()
with test_graph_detection.as_default():
model = self._build_model(
is_training=False, first_stage_only=False, second_stage_batch_size=6)
is_training=False, number_of_stages=2, second_stage_batch_size=6)
inputs_shape = (2, 20, 20, 3)
inputs = tf.to_float(tf.random_uniform(
inputs_shape, minval=0, maxval=255, dtype=tf.int32))
preprocessed_inputs = model.preprocess(inputs)
prediction_dict = model.predict(preprocessed_inputs)
model.postprocess(prediction_dict)
preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
model.postprocess(prediction_dict, true_image_shapes)
var_map = model.restore_map(from_detection_checkpoint=False)
self.assertIsInstance(var_map, dict)
saver = tf.train.Saver(var_map)
with self.test_session() as sess:
with self.test_session(graph=test_graph_classification) as sess:
saver.restore(sess, saved_model_path)
for var in sess.run(tf.report_uninitialized_variables()):
self.assertNotIn(model.first_stage_feature_extractor_scope, var.name)
self.assertNotIn(model.second_stage_feature_extractor_scope,
var.name)
self.assertNotIn(model.first_stage_feature_extractor_scope, var)
self.assertNotIn(model.second_stage_feature_extractor_scope, var)
def test_restore_map_for_detection_ckpt(self):
# Define first detection graph and save variables.
test_graph_detection1 = tf.Graph()
with test_graph_detection1.as_default():
model = self._build_model(
is_training=False, first_stage_only=False, second_stage_batch_size=6)
is_training=False, number_of_stages=2, second_stage_batch_size=6)
inputs_shape = (2, 20, 20, 3)
inputs = tf.to_float(tf.random_uniform(
inputs_shape, minval=0, maxval=255, dtype=tf.int32))
preprocessed_inputs = model.preprocess(inputs)
prediction_dict = model.predict(preprocessed_inputs)
model.postprocess(prediction_dict)
preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
model.postprocess(prediction_dict, true_image_shapes)
another_variable = tf.Variable([17.0], name='another_variable') # pylint: disable=unused-variable
init_op = tf.global_variables_initializer()
saver = tf.train.Saver()
save_path = self.get_temp_dir()
with self.test_session() as sess:
with self.test_session(graph=test_graph_detection1) as sess:
sess.run(init_op)
saved_model_path = saver.save(sess, save_path)
# Define second detection graph and restore variables.
test_graph_detection2 = tf.Graph()
with test_graph_detection2.as_default():
model2 = self._build_model(is_training=False, first_stage_only=False,
model2 = self._build_model(is_training=False, number_of_stages=2,
second_stage_batch_size=6, num_classes=42)
inputs_shape2 = (2, 20, 20, 3)
inputs2 = tf.to_float(tf.random_uniform(
inputs_shape2, minval=0, maxval=255, dtype=tf.int32))
preprocessed_inputs2 = model2.preprocess(inputs2)
prediction_dict2 = model2.predict(preprocessed_inputs2)
model2.postprocess(prediction_dict2)
preprocessed_inputs2, true_image_shapes = model2.preprocess(inputs2)
prediction_dict2 = model2.predict(preprocessed_inputs2, true_image_shapes)
model2.postprocess(prediction_dict2, true_image_shapes)
another_variable = tf.Variable([17.0], name='another_variable') # pylint: disable=unused-variable
var_map = model2.restore_map(from_detection_checkpoint=True)
self.assertIsInstance(var_map, dict)
saver = tf.train.Saver(var_map)
with self.test_session() as sess:
with self.test_session(graph=test_graph_detection2) as sess:
saver.restore(sess, saved_model_path)
for var in sess.run(tf.report_uninitialized_variables()):
self.assertNotIn(model2.first_stage_feature_extractor_scope, var.name)
self.assertNotIn(model2.second_stage_feature_extractor_scope,
var.name)
uninitialized_vars_list = sess.run(tf.report_uninitialized_variables())
self.assertIn('another_variable', uninitialized_vars_list)
for var in uninitialized_vars_list:
self.assertNotIn(model2.first_stage_feature_extractor_scope, var)
self.assertNotIn(model2.second_stage_feature_extractor_scope, var)
def test_load_all_det_checkpoint_vars(self):
test_graph_detection = tf.Graph()
with test_graph_detection.as_default():
model = self._build_model(
is_training=False,
number_of_stages=2,
second_stage_batch_size=6,
num_classes=42)
inputs_shape = (2, 20, 20, 3)
inputs = tf.to_float(
tf.random_uniform(inputs_shape, minval=0, maxval=255, dtype=tf.int32))
preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
model.postprocess(prediction_dict, true_image_shapes)
another_variable = tf.Variable([17.0], name='another_variable') # pylint: disable=unused-variable
var_map = model.restore_map(
from_detection_checkpoint=True,
load_all_detection_checkpoint_vars=True)
self.assertIsInstance(var_map, dict)
self.assertIn('another_variable', var_map)
if __name__ == '__main__':
tf.test.main()
......@@ -21,8 +21,8 @@ The R-FCN meta architecture is similar to Faster R-CNN and only differs in the
second stage. Hence this class inherits FasterRCNNMetaArch and overrides only
the `_predict_second_stage` method.
Similar to Faster R-CNN we allow for two modes: first_stage_only=True and
first_stage_only=False. In the former setting, all of the user facing methods
Similar to Faster R-CNN we allow for two modes: number_of_stages=1 and
number_of_stages=2. In the former setting, all of the user facing methods
(e.g., predict, postprocess, loss) can be used as if the model consisted
only of the RPN, returning class agnostic proposals (these can be thought of as
approximate detections with no associated class information). In the latter
......@@ -53,7 +53,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
num_classes,
image_resizer_fn,
feature_extractor,
first_stage_only,
number_of_stages,
first_stage_anchor_generator,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope,
......@@ -90,8 +90,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
returns a rank-3 image tensor, possibly with new spatial dimensions.
See builders/image_resizer_builder.py.
feature_extractor: A FasterRCNNFeatureExtractor object.
first_stage_only: Whether to construct only the Region Proposal Network
(RPN) part of the model.
number_of_stages: Valid values are {1, 2}. If 1, only the Region Proposal
Network (RPN) part of the model will be constructed.
first_stage_anchor_generator: An anchor_generator.AnchorGenerator object
(note that currently we only support
grid_anchor_generator.GridAnchorGenerator objects)
......@@ -165,7 +165,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
num_classes,
image_resizer_fn,
feature_extractor,
first_stage_only,
number_of_stages,
first_stage_anchor_generator,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope,
......@@ -199,14 +199,15 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
rpn_objectness_predictions_with_background,
rpn_features,
anchors,
image_shape):
"""Predicts the output tensors from 2nd stage of FasterRCNN.
image_shape,
true_image_shapes):
"""Predicts the output tensors from 2nd stage of R-FCN.
Args:
rpn_box_encodings: 4-D float tensor of shape
rpn_box_encodings: 3-D float tensor of shape
[batch_size, num_valid_anchors, self._box_coder.code_size] containing
predicted boxes.
rpn_objectness_predictions_with_background: 2-D float tensor of shape
rpn_objectness_predictions_with_background: 3-D float tensor of shape
[batch_size, num_valid_anchors, 2] containing class
predictions (logits) for each of the anchors. Note that this
tensor *includes* background class predictions (at class index 0).
......@@ -216,6 +217,10 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
anchors: 2-D float tensor of shape
[num_anchors, self._box_coder.code_size].
image_shape: A 1-D int32 tensor of size [4] containing the image shape.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
......@@ -223,7 +228,7 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
2) class_predictions_with_background: a 3-D tensor with shape
2) class_predictions_with_background: a 2-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
total_num_proposals=batch_size*self._max_num_proposals.
......@@ -247,9 +252,11 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
[batch_size, feature_map_height, feature_map_width, depth],
representing the box classifier features.
"""
image_shape_2d = tf.tile(tf.expand_dims(image_shape[1:], 0),
[image_shape[0], 1])
proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn(
rpn_box_encodings, rpn_objectness_predictions_with_background,
anchors, image_shape)
anchors, image_shape_2d, true_image_shapes)
box_classifier_features = (
self._feature_extractor.extract_box_classifier_features(
......@@ -257,8 +264,8 @@ class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
scope=self.second_stage_feature_extractor_scope))
box_predictions = self._rfcn_box_predictor.predict(
box_classifier_features,
num_predictions_per_location=1,
[box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
proposal_boxes=proposal_boxes_normalized)
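# The box predictor predict() API now takes parallel lists with one entry per
# feature map; R-FCN has a single box classifier feature map, hence the
# one-element lists above.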
refined_box_encodings = tf.squeeze(
......
......@@ -23,10 +23,10 @@ import re
import tensorflow as tf
from object_detection.core import box_list
from object_detection.core import box_predictor as bpredictor
from object_detection.core import model
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.utils import ops
from object_detection.utils import shape_utils
from object_detection.utils import visualization_utils
......@@ -43,7 +43,8 @@ class SSDFeatureExtractor(object):
pad_to_multiple,
conv_hyperparams,
batch_norm_trainable=True,
reuse_weights=None):
reuse_weights=None,
use_explicit_padding=False):
"""Constructor.
Args:
......@@ -58,6 +59,8 @@ class SSDFeatureExtractor(object):
(e.g. 1), it is desirable to disable batch norm update and use
pretrained batch norm params.
reuse_weights: whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False.
"""
self._is_training = is_training
self._depth_multiplier = depth_multiplier
......@@ -66,6 +69,7 @@ class SSDFeatureExtractor(object):
self._conv_hyperparams = conv_hyperparams
self._batch_norm_trainable = batch_norm_trainable
self._reuse_weights = reuse_weights
self._use_explicit_padding = use_explicit_padding
@abstractmethod
def preprocess(self, resized_inputs):
......@@ -78,6 +82,10 @@ class SSDFeatureExtractor(object):
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
"""
pass
......@@ -122,9 +130,9 @@ class SSDMetaArch(model.DetectionModel):
add_summaries=True):
"""SSDMetaArch Constructor.
TODO: group NMS parameters + score converter into a class and loss
parameters into a class and write config protos for postprocessing
and losses.
TODO(rathodv,jonathanhuang): group NMS parameters + score converter into
a class and loss parameters into a class and write config protos for
postprocessing and losses.
Args:
is_training: A boolean indicating whether the training version of the
......@@ -138,7 +146,9 @@ class SSDMetaArch(model.DetectionModel):
region_similarity_calculator.RegionSimilarityCalculator object.
image_resizer_fn: a callable for image resizing. This callable always
takes a rank-3 image tensor (corresponding to a single image) and
returns a rank-3 image tensor, possibly with new spatial dimensions.
returns a rank-3 image tensor, possibly with new spatial dimensions, and
a 1-D tensor of shape [3] indicating the shape of the true image within
the resized image tensor, as the resized image tensor could be padded.
See builders/image_resizer_builder.py.
non_max_suppression_fn: batch_multiclass_non_max_suppression
callable that takes `boxes`, `scores` and optional `clip_window`
......@@ -174,14 +184,14 @@ class SSDMetaArch(model.DetectionModel):
self._matcher = matcher
self._region_similarity_calculator = region_similarity_calculator
# TODO: handle agnostic mode and positive/negative class weights
# TODO: handle agnostic mode and positive/negative class
# weights
unmatched_cls_target = None
unmatched_cls_target = tf.constant([1] + self.num_classes * [0], tf.float32)
self._target_assigner = target_assigner.TargetAssigner(
self._region_similarity_calculator,
self._matcher,
self._box_coder,
positive_class_weight=1.0,
negative_class_weight=1.0,
unmatched_cls_target=unmatched_cls_target)
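# Note: unmatched anchors are now assigned an explicit one-hot background
# classification target instead of None; e.g. with num_classes=3 the target
# row is [1] + 3 * [0] -> [1.0, 0.0, 0.0, 0.0].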
......@@ -210,7 +220,9 @@ class SSDMetaArch(model.DetectionModel):
def preprocess(self, inputs):
"""Feature-extractor specific preprocessing.
See base class.
SSD meta architecture uses a default clip_window of [0, 0, 1, 1] during
post-processing. When the `preprocess` method is called, the clip_window is
updated based on the `true_image_shapes` returned by `image_resizer_fn`.
Args:
inputs: a [batch, height_in, width_in, channels] float tensor representing
......@@ -219,20 +231,69 @@ class SSDMetaArch(model.DetectionModel):
Returns:
preprocessed_inputs: a [batch, height_out, width_out, channels] float
tensor representing a batch of images.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Raises:
ValueError: if inputs tensor does not have type tf.float32
"""
if inputs.dtype is not tf.float32:
raise ValueError('`preprocess` expects a tf.float32 tensor')
with tf.name_scope('Preprocessor'):
# TODO: revisit whether to always use batch size as the number of parallel
# iterations vs allow for dynamic batching.
resized_inputs = tf.map_fn(self._image_resizer_fn,
elems=inputs,
dtype=tf.float32)
return self._feature_extractor.preprocess(resized_inputs)
def predict(self, preprocessed_inputs):
# TODO: revisit whether to always use batch size as
# the number of parallel iterations vs allow for dynamic batching.
outputs = shape_utils.static_or_dynamic_map_fn(
self._image_resizer_fn,
elems=inputs,
dtype=[tf.float32, tf.int32])
resized_inputs = outputs[0]
true_image_shapes = outputs[1]
return (self._feature_extractor.preprocess(resized_inputs),
true_image_shapes)
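For illustration only, a hypothetical resizer callable compatible with the new
contract (the name and fixed canvas size below are invented) would return both
outputs with the dtypes [tf.float32, tf.int32] expected by
static_or_dynamic_map_fn:
def example_fixed_pad_resizer(image, canvas_size=300):
  # Hypothetical example: pad a single [height, width, 3] image to a fixed
  # canvas and report the pre-padding shape as the true image shape.
  true_shape = tf.stack([tf.shape(image)[0], tf.shape(image)[1], 3])
  padded = tf.image.pad_to_bounding_box(image, 0, 0, canvas_size, canvas_size)
  return [tf.to_float(padded), true_shape]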
def _compute_clip_window(self, preprocessed_images, true_image_shapes):
"""Computes clip window to use during post_processing.
Computes a new clip window to use during post-processing based on the shapes
of the resized images (`preprocessed_images`) and `true_image_shapes`, when
the `preprocess` method has been called. Otherwise returns a default clip
window of [0, 0, 1, 1].
Args:
preprocessed_images: the [batch, height, width, channels] image
tensor.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros. Or None if the clip window should cover the full image.
Returns:
a 2-D float32 tensor of the form [batch_size, 4] containing the clip
window for each image in the batch in normalized coordinates (relative to
the resized dimensions) where each clip window is of the form [ymin, xmin,
ymax, xmax] or a default clip window of [0, 0, 1, 1].
"""
if true_image_shapes is None:
return tf.constant([0, 0, 1, 1], dtype=tf.float32)
resized_inputs_shape = shape_utils.combined_static_and_dynamic_shape(
preprocessed_images)
true_heights, true_widths, _ = tf.unstack(
tf.to_float(true_image_shapes), axis=1)
padded_height = tf.to_float(resized_inputs_shape[1])
padded_width = tf.to_float(resized_inputs_shape[2])
return tf.stack(
[
tf.zeros_like(true_heights),
tf.zeros_like(true_widths), true_heights / padded_height,
true_widths / padded_width
],
axis=1)
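As a worked example (values assumed): for a single image padded to a 300x300
canvas whose true content is 200x100, the stacked window is
[0, 0, 200/300, 100/300]:
# Hypothetical shapes for illustration.
preprocessed = tf.zeros([1, 300, 300, 3])
true_shapes = tf.constant([[200, 100, 3]], dtype=tf.int32)
# _compute_clip_window(preprocessed, true_shapes) evaluates to approximately
# [[0.0, 0.0, 0.6667, 0.3333]], so NMS clips detections to the unpadded area.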
def predict(self, preprocessed_inputs, true_image_shapes):
"""Predicts unpostprocessed tensors from input tensor.
This function takes an input batch of images and runs it through the forward
......@@ -244,18 +305,24 @@ class SSDMetaArch(model.DetectionModel):
Args:
preprocessed_inputs: a [batch, height, width, channels] image tensor.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) box_encodings: 4-D float tensor of shape [batch_size, num_anchors,
1) preprocessed_inputs: the [batch, height, width, channels] image
tensor.
2) box_encodings: 4-D float tensor of shape [batch_size, num_anchors,
box_code_dimension] containing predicted boxes.
2) class_predictions_with_background: 3-D float tensor of shape
3) class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions (at class index 0).
3) feature_maps: a list of tensors where the ith tensor has shape
4) feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i].
4) anchors: 2-D float tensor of shape [num_anchors, 4] containing
5) anchors: 2-D float tensor of shape [num_anchors, 4] containing
the generated anchors in normalized coordinates.
"""
with tf.variable_scope(None, self._extract_features_scope,
......@@ -268,9 +335,13 @@ class SSDMetaArch(model.DetectionModel):
feature_map_spatial_dims,
im_height=image_shape[1],
im_width=image_shape[2])
(box_encodings, class_predictions_with_background
) = self._add_box_predictions_to_feature_maps(feature_maps)
prediction_dict = self._box_predictor.predict(
feature_maps, self._anchor_generator.num_anchors_per_location())
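# box_encodings returned by the predictor have shape
# [batch_size, num_anchors, 1, code_size]; the squeeze below drops the
# singleton axis to give [batch_size, num_anchors, code_size].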
box_encodings = tf.squeeze(prediction_dict['box_encodings'], axis=2)
class_predictions_with_background = prediction_dict[
'class_predictions_with_background']
predictions_dict = {
'preprocessed_inputs': preprocessed_inputs,
'box_encodings': box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'feature_maps': feature_maps,
......@@ -278,68 +349,6 @@ class SSDMetaArch(model.DetectionModel):
}
return predictions_dict
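Putting the revised signatures together, a minimal inference sketch (assuming
`model` is an already-constructed SSDMetaArch, as in the tests above):
images = tf.placeholder(tf.float32, shape=[None, None, None, 3])
preprocessed, true_shapes = model.preprocess(images)
predictions = model.predict(preprocessed, true_shapes)
detections = model.postprocess(predictions, true_shapes)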
def _add_box_predictions_to_feature_maps(self, feature_maps):
"""Adds box predictors to each feature map and returns concatenated results.
Args:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
Returns:
box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
box_code_dimension] containing predicted boxes.
class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions (at class index 0).
Raises:
RuntimeError: if the number of feature maps extracted via the
extract_features method does not match the length of the
num_anchors_per_locations list that was passed to the constructor.
RuntimeError: if box_encodings from the box_predictor does not have
shape of the form [batch_size, num_anchors, 1, code_size].
"""
num_anchors_per_location_list = (
self._anchor_generator.num_anchors_per_location())
if len(feature_maps) != len(num_anchors_per_location_list):
raise RuntimeError('the number of feature maps must match the '
'length of self.anchors.NumAnchorsPerLocation().')
box_encodings_list = []
cls_predictions_with_background_list = []
for idx, (feature_map, num_anchors_per_location
) in enumerate(zip(feature_maps, num_anchors_per_location_list)):
box_predictor_scope = 'BoxPredictor_{}'.format(idx)
box_predictions = self._box_predictor.predict(feature_map,
num_anchors_per_location,
box_predictor_scope)
box_encodings = box_predictions[bpredictor.BOX_ENCODINGS]
cls_predictions_with_background = box_predictions[
bpredictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
box_encodings_shape = box_encodings.get_shape().as_list()
if len(box_encodings_shape) != 4 or box_encodings_shape[2] != 1:
raise RuntimeError('box_encodings from the box_predictor must be of '
'shape `[batch_size, num_anchors, 1, code_size]`; '
'actual shape', box_encodings_shape)
box_encodings = tf.squeeze(box_encodings, axis=2)
box_encodings_list.append(box_encodings)
cls_predictions_with_background_list.append(
cls_predictions_with_background)
num_predictions = sum(
[tf.shape(box_encodings)[1] for box_encodings in box_encodings_list])
num_anchors = self.anchors.num_boxes()
anchors_assert = tf.assert_equal(num_anchors, num_predictions, [
'Mismatch: number of anchors vs number of predictions', num_anchors,
num_predictions
])
with tf.control_dependencies([anchors_assert]):
box_encodings = tf.concat(box_encodings_list, 1)
class_predictions_with_background = tf.concat(
cls_predictions_with_background_list, 1)
return box_encodings, class_predictions_with_background
def _get_feature_map_spatial_dims(self, feature_maps):
"""Return list of spatial dimensions for each feature map in a list.
......@@ -356,7 +365,7 @@ class SSDMetaArch(model.DetectionModel):
]
return [(shape[1], shape[2]) for shape in feature_map_shapes]
def postprocess(self, prediction_dict):
def postprocess(self, prediction_dict, true_image_shapes):
"""Converts prediction tensors to final detections.
This function converts raw prediction tensors to final detection results by
......@@ -370,12 +379,18 @@ class SSDMetaArch(model.DetectionModel):
Args:
prediction_dict: a dictionary holding prediction tensors with
1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
1) preprocessed_inputs: a [batch, height, width, channels] image
tensor.
2) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
box_code_dimension] containing predicted boxes.
2) class_predictions_with_background: 3-D float tensor of shape
3) class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros. Or None, if the clip window should cover the full image.
Returns:
detections: a dictionary containing the following fields
......@@ -393,18 +408,18 @@ class SSDMetaArch(model.DetectionModel):
'class_predictions_with_background' not in prediction_dict):
raise ValueError('prediction_dict does not contain expected entries.')
with tf.name_scope('Postprocessor'):
preprocessed_images = prediction_dict['preprocessed_inputs']
box_encodings = prediction_dict['box_encodings']
class_predictions = prediction_dict['class_predictions_with_background']
detection_boxes, detection_keypoints = self._batch_decode(box_encodings)
detection_boxes = tf.expand_dims(detection_boxes, axis=2)
class_predictions_without_background = tf.slice(class_predictions,
[0, 0, 1],
[-1, -1, -1])
detection_scores = self._score_conversion_fn(
class_predictions_without_background)
clip_window = tf.constant([0, 0, 1, 1], tf.float32)
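# Score conversion is now applied before slicing off the background column,
# so that e.g. a softmax score converter normalizes over all classes,
# including background.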
detection_scores_with_background = self._score_conversion_fn(
class_predictions)
detection_scores = tf.slice(detection_scores_with_background, [0, 0, 1],
[-1, -1, -1])
additional_fields = None
if detection_keypoints is not None:
additional_fields = {
fields.BoxListFields.keypoints: detection_keypoints}
......@@ -412,19 +427,23 @@ class SSDMetaArch(model.DetectionModel):
num_detections) = self._non_max_suppression_fn(
detection_boxes,
detection_scores,
clip_window=clip_window,
clip_window=self._compute_clip_window(
preprocessed_images, true_image_shapes),
additional_fields=additional_fields)
detection_dict = {'detection_boxes': nmsed_boxes,
'detection_scores': nmsed_scores,
'detection_classes': nmsed_classes,
'num_detections': tf.to_float(num_detections)}
detection_dict = {
fields.DetectionResultFields.detection_boxes: nmsed_boxes,
fields.DetectionResultFields.detection_scores: nmsed_scores,
fields.DetectionResultFields.detection_classes: nmsed_classes,
fields.DetectionResultFields.num_detections:
tf.to_float(num_detections)
}
if (nmsed_additional_fields is not None and
fields.BoxListFields.keypoints in nmsed_additional_fields):
detection_dict['detection_keypoints'] = nmsed_additional_fields[
fields.BoxListFields.keypoints]
detection_dict[fields.DetectionResultFields.detection_keypoints] = (
nmsed_additional_fields[fields.BoxListFields.keypoints])
return detection_dict
def loss(self, prediction_dict, scope=None):
def loss(self, prediction_dict, true_image_shapes, scope=None):
"""Compute scalar loss tensors with respect to provided groundtruth.
Calling this function requires that groundtruth tensors have been
......@@ -438,6 +457,10 @@ class SSDMetaArch(model.DetectionModel):
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
scope: Optional scope name.
Returns:
......@@ -457,17 +480,17 @@ class SSDMetaArch(model.DetectionModel):
if self._add_summaries:
self._summarize_input(
self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
num_matches = tf.stack(
[match.num_matched_columns() for match in match_list])
location_losses = self._localization_loss(
prediction_dict['box_encodings'],
batch_reg_targets,
ignore_nan_targets=True,
weights=batch_reg_weights)
cls_losses = self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
weights=batch_cls_weights)
cls_losses = ops.reduce_sum_trailing_dimensions(
self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
weights=batch_cls_weights),
ndims=2)
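# reduce_sum_trailing_dimensions sums out any dimensions beyond the first
# two, so cls_losses has shape [batch_size, num_anchors] and lines up with
# location_losses.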
if self._hard_example_miner:
(localization_loss, classification_loss) = self._apply_hard_mining(
......@@ -487,7 +510,8 @@ class SSDMetaArch(model.DetectionModel):
# Optionally normalize by number of positive matches
normalizer = tf.constant(1.0, dtype=tf.float32)
if self._normalize_loss_by_num_matches:
normalizer = tf.maximum(tf.to_float(tf.reduce_sum(num_matches)), 1.0)
normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)),
1.0)
with tf.name_scope('localization_loss'):
localization_loss = ((self._localization_loss_weight / normalizer) *
......@@ -675,7 +699,9 @@ class SSDMetaArch(model.DetectionModel):
[combined_shape[0], combined_shape[1], 4]))
return decoded_boxes, decoded_keypoints
def restore_map(self, from_detection_checkpoint=True):
def restore_map(self,
from_detection_checkpoint=True,
load_all_detection_checkpoint_vars=False):
"""Returns a map of variables to load from a foreign checkpoint.
See parent class for details.
......@@ -684,6 +710,9 @@ class SSDMetaArch(model.DetectionModel):
from_detection_checkpoint: whether to restore from a full detection
checkpoint (with compatible variable names) or to restore from a
classification checkpoint for initialization prior to training.
load_all_detection_checkpoint_vars: whether to load all variables (when
`from_detection_checkpoint` is True). If False, only variables within
the appropriate scopes are included. Default False.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
......@@ -691,10 +720,15 @@ class SSDMetaArch(model.DetectionModel):
"""
variables_to_restore = {}
for variable in tf.global_variables():
if variable.op.name.startswith(self._extract_features_scope):
var_name = variable.op.name
if not from_detection_checkpoint:
var_name = (re.split('^' + self._extract_features_scope + '/',
var_name)[-1])
var_name = variable.op.name
if from_detection_checkpoint and load_all_detection_checkpoint_vars:
variables_to_restore[var_name] = variable
else:
if var_name.startswith(self._extract_features_scope):
if not from_detection_checkpoint:
var_name = (
re.split('^' + self._extract_features_scope + '/',
var_name)[-1])
variables_to_restore[var_name] = variable
return variables_to_restore
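Mirroring test_load_all_det_checkpoint_vars above, a sketch of restoring with
the new flag (the checkpoint path is a placeholder):
var_map = model.restore_map(from_detection_checkpoint=True,
                            load_all_detection_checkpoint_vars=True)
saver = tf.train.Saver(var_map)
with tf.Session() as sess:
  saver.restore(sess, '/path/to/detection.ckpt')  # placeholder path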