Unverified Commit 59f7e80a authored by pkulzc, committed by GitHub

Update object detection post-processing and fix the box padding/clipping issue. (#5026)

* Merged commit includes the following changes:
207771702  by Zhichao Lu:

    Refactoring evaluation utilities so that it is easier to introduce new DetectionEvaluators with eval_metric_ops.
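    A minimal single-image sketch of the resulting interface, mirroring the test changes further down in this diff (the category list and tensor values here are illustrative assumptions):

```python
import tensorflow as tf
from object_detection.core import standard_fields
from object_detection.metrics import coco_evaluation

input_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields

# Hypothetical single-category evaluator; values below are placeholders.
evaluator = coco_evaluation.CocoDetectionEvaluator(
    categories=[{'id': 1, 'name': 'object'}])
eval_dict = {
    input_fields.key: tf.constant('image_0'),
    input_fields.groundtruth_boxes: tf.constant([[10., 10., 50., 50.]]),
    input_fields.groundtruth_classes: tf.constant([1]),
    detection_fields.detection_boxes: tf.constant([[10., 10., 50., 50.]]),
    detection_fields.detection_scores: tf.constant([0.9]),
    detection_fields.detection_classes: tf.constant([1]),
}
# Each entry maps a metric name to (value_op, update_op) for EstimatorSpec.
eval_metric_ops = evaluator.get_estimator_eval_metric_ops(eval_dict)
```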

--
207758641  by Zhichao Lu:

    Require tensorflow version 1.9+ for running the object detection API.

--
207641470  by Zhichao Lu:

    Clip `num_groundtruth_boxes` in pad_input_data_to_static_shapes() to `max_num_boxes`. This prevents a scenario where tensors are sliced to an invalid range in model_lib.unstack_batch().
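    A minimal sketch of the failure mode and the fix, with assumed sizes (the real logic lives in pad_input_data_to_static_shapes and model_lib.unstack_batch):

```python
import tensorflow as tf

max_num_boxes = 3
boxes = tf.zeros([5, 4])            # 5 groundtruth boxes before padding
clipped = boxes[:max_num_boxes]     # pad_or_clip_nd result: shape [3, 4]
num_boxes = tf.constant(5)          # stale count from the input pipeline
# tf.slice(clipped, [0, 0], [num_boxes, -1])  # invalid range: 5 > 3
num_boxes = tf.minimum(num_boxes, max_num_boxes)  # the clipping fix
valid_boxes = tf.slice(clipped, [0, 0], [num_boxes, -1])  # shape [3, 4]
```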

--
207621728  by Zhichao Lu:

    This CL adds a FreezableBatchNorm that inherits from the Keras BatchNormalization layer, but supports freezing the `training` parameter at construction time instead of having to do it in the `call` method.

    It also adds a method to the `KerasLayerHyperparams` class that will build an appropriate FreezableBatchNorm layer according to the hyperparameter configuration. If batch_norm is disabled, this method returns an Identity layer.

    These will be used to simplify the conversion to Keras APIs.
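    A minimal sketch of the described layer, assuming the standard Keras BatchNormalization API (the actual class and its hyperparams hook may differ in detail):

```python
import tensorflow as tf

class FreezableBatchNorm(tf.keras.layers.BatchNormalization):
  """Batch norm whose `training` behavior can be frozen at construction."""

  def __init__(self, training=None, **kwargs):
    super(FreezableBatchNorm, self).__init__(**kwargs)
    # None means "respect whatever `training` is passed at call time".
    self._training = training

  def call(self, inputs, training=None):
    # A value frozen at construction overrides the call-time flag.
    if self._training is not None:
      training = self._training
    return super(FreezableBatchNorm, self).call(inputs, training=training)
```

    When batch norm is disabled in the hyperparameter configuration, the builder method can simply return an identity layer (for example `tf.keras.layers.Lambda(tf.identity)`).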

--
207610524  by Zhichao Lu:

    Update anchor generators and box predictors for python3 compatibility.

--
207585122  by Zhichao Lu:

    Refactoring convolutional box predictor into separate prediction heads.

--
207549305  by Zhichao Lu:

    Pass all 1s for batch weights if nothing is specified in GT.
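    A sketch of the fallback (shapes assumed), matching the `_format_groundtruth_data` change below:

```python
import tensorflow as tf

# Two images with 4 and 2 groundtruth boxes; classes are placeholders.
groundtruth_classes_list = [tf.zeros([4, 3]), tf.zeros([2, 3])]
# No weights provided: default every groundtruth box to weight 1.0.
groundtruth_weights_list = [
    tf.ones(tf.shape(classes)[0]) for classes in groundtruth_classes_list]
```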

--
207336575  by Zhichao Lu:

    Move the new argument 'target_assigner_instance' to the end of the list of arguments to the ssd_meta_arch constructor for backwards compatibility.

--
207327862  by Zhichao Lu:

    Enable support for float output in quantized custom op for postprocessing in SSD Mobilenet model.

--
207323154  by Zhichao Lu:

    Bug fix: change dict.iteritems() to dict.items()
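    A one-line illustration of the Python 3 incompatibility (the dictionary here is an arbitrary stand-in):

```python
metrics = {'DetectionBoxes_Precision/mAP': 0.5}  # arbitrary stand-in dict
# Python 2 only; raises AttributeError on Python 3:
#   scaled = {k: v for k, v in metrics.iteritems()}
# Portable form:
scaled = {k: v for k, v in metrics.items()}
```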

--
207301109  by Zhichao Lu:

    Integrating expected_classification_loss_under_sampling op as an option in the ssd_meta_arch.

--
207286221  by Zhichao Lu:

    Adding an option to weight regression loss with foreground scores from the ground truth labels.

--
207231739  by Zhichao Lu:

    Explicitly mentioning the argument names when calling the batch target assigner.

--
207206356  by Zhichao Lu:

    Add include_trainable_variables field to train config to better handle trainable variables.
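    A sketch of the intended semantics using `tf.contrib.framework.filter_variables` (the scope names are assumptions, mirroring the model_lib change below): include patterns select the candidates, and freeze patterns then exclude matches.

```python
import tensorflow as tf

with tf.variable_scope('FeatureExtractor'):
  tf.get_variable('conv_w', shape=[3, 3, 3, 8])
with tf.variable_scope('BoxPredictor'):
  tf.get_variable('box_w', shape=[8, 4])

# Train only what update_trainable_variables includes, minus what
# freeze_variables excludes: here only the BoxPredictor variable remains.
trainable_variables = tf.contrib.framework.filter_variables(
    tf.trainable_variables(),
    include_patterns=['FeatureExtractor', 'BoxPredictor'],
    exclude_patterns=['FeatureExtractor'])
```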

--
207135930  by Zhichao Lu:

    Internal change.

--
206862541  by Zhichao Lu:

    Do not unpad the outputs from batch_non_max_suppression before sampling.

    Since BalancedPositiveNegativeSampler takes an indicator of valid positions to sample from, we can pass the output of NMS directly into the sampler.
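    A small sketch of the indicator (sizes assumed):

```python
import tensorflow as tf

padded_size = 8   # fixed-size output of batch_non_max_suppression
num_valid = 5     # valid detections for one image
# Padded tail positions are simply marked invalid for the sampler.
valid_indicator = tf.range(padded_size) < num_valid
# -> [True, True, True, True, True, False, False, False]
```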

--

PiperOrigin-RevId: 207771702

* Remove unused doc.
parent fb6bc29b
......@@ -44,7 +44,7 @@ job using GPUs. A sample YAML file is given below:
```
trainingInput:
runtimeVersion: "1.8"
runtimeVersion: "1.9"
scaleTier: CUSTOM
masterType: standard_gpu
workerCount: 9
......@@ -73,7 +73,7 @@ following command:
```bash
# From tensorflow/models/research/
gcloud ml-engine jobs submit training object_detection_`date +%m_%d_%Y_%H_%M_%S` \
--runtime-version 1.8 \
--runtime-version 1.9 \
--job-dir=gs://${MODEL_DIR} \
--packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz,/tmp/pycocotools/pycocotools-2.0.tar.gz \
--module-name object_detection.model_main \
......@@ -93,7 +93,7 @@ Google Cloud Storage.
Users can monitor the progress of their training job on the [ML Engine
Dashboard](https://console.cloud.google.com/mlengine/jobs).
Note: This sample is supported for use with 1.8 runtime version.
Note: This sample is supported for use with 1.9 runtime version.
## Running a TPU Training Job on CMLE
......@@ -105,7 +105,7 @@ gcloud ml-engine jobs submit training `whoami`_object_detection_`date +%m_%d_%Y_
--job-dir=gs://${MODEL_DIR} \
--packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz,/tmp/pycocotools/pycocotools-2.0.tar.gz \
--module-name object_detection.model_tpu_main \
--runtime-version 1.8 \
--runtime-version 1.9 \
--scale-tier BASIC_TPU \
--region us-central1 \
-- \
......@@ -133,7 +133,7 @@ job:
```bash
gcloud ml-engine jobs submit training object_detection_eval_`date +%m_%d_%Y_%H_%M_%S` \
--runtime-version 1.8 \
--runtime-version 1.9 \
--job-dir=gs://${MODEL_DIR} \
--packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz,/tmp/pycocotools/pycocotools-2.0.tar.gz \
--module-name object_detection.model_main \
......
......@@ -221,6 +221,14 @@ def pad_input_data_to_static_shapes(tensor_dict, max_num_boxes, num_classes,
for tensor_name in tensor_dict:
padded_tensor_dict[tensor_name] = shape_utils.pad_or_clip_nd(
tensor_dict[tensor_name], padding_shapes[tensor_name])
# Make sure that the number of groundtruth boxes now reflects the
# padded/clipped tensors.
if fields.InputDataFields.num_groundtruth_boxes in padded_tensor_dict:
padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = (
tf.minimum(
padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes],
max_num_boxes))
return padded_tensor_dict
......
......@@ -663,6 +663,8 @@ class PadInputDataToStaticShapesFnTest(tf.test.TestCase):
tf.placeholder(tf.float32, [None, 4]),
fields.InputDataFields.groundtruth_classes:
tf.placeholder(tf.int32, [None, 3]),
fields.InputDataFields.num_groundtruth_boxes:
tf.placeholder(tf.int32, [])
}
padded_tensor_dict = inputs.pad_input_data_to_static_shapes(
tensor_dict=input_tensor_dict,
......@@ -685,6 +687,8 @@ class PadInputDataToStaticShapesFnTest(tf.test.TestCase):
np.random.rand(5, 4),
input_tensor_dict[fields.InputDataFields.groundtruth_classes]:
np.random.rand(2, 3),
input_tensor_dict[fields.InputDataFields.num_groundtruth_boxes]:
5,
})
self.assertAllEqual(
......@@ -692,6 +696,9 @@ class PadInputDataToStaticShapesFnTest(tf.test.TestCase):
self.assertAllEqual(
out_tensor_dict[fields.InputDataFields.groundtruth_classes].shape,
[3, 3])
self.assertEqual(
out_tensor_dict[fields.InputDataFields.num_groundtruth_boxes],
3)
def test_do_not_pad_dynamic_images(self):
input_tensor_dict = {
......
......@@ -172,7 +172,8 @@ def _create_losses(input_queue, create_model_fn, train_config):
"""
detection_model = create_model_fn()
(images, _, groundtruth_boxes_list, groundtruth_classes_list,
groundtruth_masks_list, groundtruth_keypoints_list, _) = get_inputs(
groundtruth_masks_list, groundtruth_keypoints_list,
groundtruth_weights_list) = get_inputs(
input_queue,
detection_model.num_classes,
train_config.merge_multiple_label_boxes,
......@@ -193,10 +194,12 @@ def _create_losses(input_queue, create_model_fn, train_config):
if any(keypoints is None for keypoints in groundtruth_keypoints_list):
groundtruth_keypoints_list = None
detection_model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list,
groundtruth_keypoints_list)
detection_model.provide_groundtruth(
groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list,
groundtruth_keypoints_list,
groundtruth_weights_list=groundtruth_weights_list)
prediction_dict = detection_model.predict(images, true_image_shapes)
losses_dict = detection_model.loss(prediction_dict, true_image_shapes)
......
......@@ -97,6 +97,7 @@ from functools import partial
import tensorflow as tf
from object_detection.anchor_generators import grid_anchor_generator
from object_detection.builders import box_predictor_builder
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import box_predictor
......@@ -105,7 +106,6 @@ from object_detection.core import model
from object_detection.core import post_processing
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.predictors import convolutional_box_predictor
from object_detection.utils import ops
from object_detection.utils import shape_utils
......@@ -413,17 +413,17 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._first_stage_minibatch_size = first_stage_minibatch_size
self._first_stage_sampler = first_stage_sampler
self._first_stage_box_predictor = (
convolutional_box_predictor.ConvolutionalBoxPredictor(
self._is_training,
box_predictor_builder.build_convolutional_box_predictor(
is_training=self._is_training,
num_classes=1,
conv_hyperparams_fn=self._first_stage_box_predictor_arg_scope_fn,
min_depth=0,
max_depth=0,
num_layers_before_predictor=0,
use_dropout=False,
dropout_keep_prob=1.0,
box_code_size=self._box_coder.code_size,
kernel_size=1,
box_code_size=self._box_coder.code_size))
num_layers_before_predictor=0,
min_depth=0,
max_depth=0))
self._first_stage_nms_score_threshold = first_stage_nms_score_threshold
self._first_stage_nms_iou_threshold = first_stage_nms_iou_threshold
......@@ -1236,11 +1236,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
proposal_boxes = tf.stop_gradient(proposal_boxes)
if not self._hard_example_miner:
(groundtruth_boxlists, groundtruth_classes_with_background_list, _,
_) = self._format_groundtruth_data(true_image_shapes)
groundtruth_weights_list
) = self._format_groundtruth_data(true_image_shapes)
(proposal_boxes, proposal_scores,
num_proposals) = self._unpad_proposals_and_sample_box_classifier_batch(
num_proposals) = self._sample_box_classifier_batch(
proposal_boxes, proposal_scores, num_proposals,
groundtruth_boxlists, groundtruth_classes_with_background_list)
groundtruth_boxlists, groundtruth_classes_with_background_list,
groundtruth_weights_list)
# normalize proposal boxes
def normalize_boxes(args):
proposal_boxes_per_image = args[0]
......@@ -1253,14 +1255,15 @@ class FasterRCNNMetaArch(model.DetectionModel):
normalize_boxes, elems=[proposal_boxes, image_shapes], dtype=tf.float32)
return normalized_proposal_boxes, proposal_scores, num_proposals
def _unpad_proposals_and_sample_box_classifier_batch(
def _sample_box_classifier_batch(
self,
proposal_boxes,
proposal_scores,
num_proposals,
groundtruth_boxlists,
groundtruth_classes_with_background_list):
"""Unpads proposals and samples a minibatch for second stage.
groundtruth_classes_with_background_list,
groundtruth_weights_list):
"""Samples a minibatch for second stage.
Args:
proposal_boxes: A float tensor with shape
......@@ -1278,6 +1281,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
groundtruth_classes_with_background_list: A list of 2-D one-hot
(or k-hot) tensors of shape [num_boxes, num_classes+1] containing the
class targets with the 0th index assumed to map to the background class.
groundtruth_weights_list: A list of 1-D tensors of shape [num_boxes]
indicating the weight associated with the groundtruth boxes.
Returns:
proposal_boxes: A float tensor with shape
......@@ -1298,31 +1303,23 @@ class FasterRCNNMetaArch(model.DetectionModel):
single_image_proposal_scores,
single_image_num_proposals,
single_image_groundtruth_boxlist,
single_image_groundtruth_classes_with_background) in zip(
single_image_groundtruth_classes_with_background,
single_image_groundtruth_weights) in zip(
tf.unstack(proposal_boxes),
tf.unstack(proposal_scores),
tf.unstack(num_proposals),
groundtruth_boxlists,
groundtruth_classes_with_background_list):
static_shape = single_image_proposal_boxes.get_shape()
sliced_static_shape = tf.TensorShape([tf.Dimension(None),
static_shape.dims[-1]])
single_image_proposal_boxes = tf.slice(
single_image_proposal_boxes,
[0, 0],
[single_image_num_proposals, -1])
single_image_proposal_boxes.set_shape(sliced_static_shape)
single_image_proposal_scores = tf.slice(single_image_proposal_scores,
[0],
[single_image_num_proposals])
groundtruth_classes_with_background_list,
groundtruth_weights_list):
single_image_boxlist = box_list.BoxList(single_image_proposal_boxes)
single_image_boxlist.add_field(fields.BoxListFields.scores,
single_image_proposal_scores)
sampled_boxlist = self._sample_box_classifier_minibatch(
sampled_boxlist = self._sample_box_classifier_minibatch_single_image(
single_image_boxlist,
single_image_num_proposals,
single_image_groundtruth_boxlist,
single_image_groundtruth_classes_with_background)
single_image_groundtruth_classes_with_background,
single_image_groundtruth_weights)
sampled_padded_boxlist = box_list_ops.pad_or_clip_box_list(
sampled_boxlist,
num_boxes=self._second_stage_batch_size)
......@@ -1394,18 +1391,23 @@ class FasterRCNNMetaArch(model.DetectionModel):
resized_masks_list.append(resized_mask)
groundtruth_masks_list = resized_masks_list
groundtruth_weights_list = None
if self.groundtruth_has_field(fields.BoxListFields.weights):
groundtruth_weights_list = self.groundtruth_lists(
fields.BoxListFields.weights)
else:
# Set weights for all batch elements equally to 1.0
groundtruth_weights_list = []
for groundtruth_classes in groundtruth_classes_with_background_list:
num_gt = tf.shape(groundtruth_classes)[0]
groundtruth_weights = tf.ones(num_gt)
groundtruth_weights_list.append(groundtruth_weights)
return (groundtruth_boxlists, groundtruth_classes_with_background_list,
groundtruth_masks_list, groundtruth_weights_list)
def _sample_box_classifier_minibatch(self,
proposal_boxlist,
groundtruth_boxlist,
groundtruth_classes_with_background):
def _sample_box_classifier_minibatch_single_image(
self, proposal_boxlist, num_valid_proposals, groundtruth_boxlist,
groundtruth_classes_with_background, groundtruth_weights):
"""Samples a mini-batch of proposals to be sent to the box classifier.
Helper function for self._postprocess_rpn.
......@@ -1413,12 +1415,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
proposal_boxlist: A BoxList containing K proposal boxes in absolute
coordinates.
num_valid_proposals: Number of valid proposals in the proposal boxlist.
groundtruth_boxlist: A Boxlist containing N groundtruth object boxes in
absolute coordinates.
groundtruth_classes_with_background: A tensor with shape
`[N, self.num_classes + 1]` representing groundtruth classes. The
classes are assumed to be k-hot encoded, and include background as the
zero-th class.
groundtruth_weights: Weights attached to the groundtruth_boxes.
Returns:
a BoxList containing the sampled proposals.
......@@ -1428,15 +1432,19 @@ class FasterRCNNMetaArch(model.DetectionModel):
groundtruth_boxlist,
groundtruth_classes_with_background,
unmatched_class_label=tf.constant(
[1] + self._num_classes * [0], dtype=tf.float32))
[1] + self._num_classes * [0], dtype=tf.float32),
groundtruth_weights=groundtruth_weights)
# Selects all boxes as candidates if none of them is selected according
# to cls_weights. This could happen as boxes within certain IOU ranges
# are ignored. If triggered, the selected boxes will still be ignored
# during loss computation.
cls_weights += tf.to_float(tf.equal(tf.reduce_sum(cls_weights), 0))
positive_indicator = tf.greater(tf.argmax(cls_targets, axis=1), 0)
valid_indicator = tf.logical_and(
tf.range(proposal_boxlist.num_boxes()) < num_valid_proposals,
cls_weights > 0
)
sampled_indices = self._second_stage_sampler.subsample(
tf.cast(cls_weights, tf.bool),
valid_indicator,
self._second_stage_batch_size,
positive_indicator)
return box_list_ops.boolean_mask(proposal_boxlist, sampled_indices)
......@@ -1704,9 +1712,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
with tf.name_scope('RPNLoss'):
(batch_cls_targets, batch_cls_weights, batch_reg_targets,
batch_reg_weights, _) = target_assigner.batch_assign_targets(
self._proposal_target_assigner, box_list.BoxList(anchors),
groundtruth_boxlists,
len(groundtruth_boxlists) * [None],
target_assigner=self._proposal_target_assigner,
anchors_batch=box_list.BoxList(anchors),
gt_box_batch=groundtruth_boxlists,
gt_class_targets_batch=(len(groundtruth_boxlists) * [None]),
gt_weights_batch=groundtruth_weights_list)
batch_cls_targets = tf.squeeze(batch_cls_targets, axis=2)
......@@ -1827,10 +1836,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
(batch_cls_targets_with_background, batch_cls_weights, batch_reg_targets,
batch_reg_weights, _) = target_assigner.batch_assign_targets(
self._detector_target_assigner,
proposal_boxlists,
groundtruth_boxlists,
groundtruth_classes_with_background_list,
target_assigner=self._detector_target_assigner,
anchors_batch=proposal_boxlists,
gt_box_batch=groundtruth_boxlists,
gt_class_targets_batch=groundtruth_classes_with_background_list,
unmatched_class_label=tf.constant(
[1] + self._num_classes * [0], dtype=tf.float32),
gt_weights_batch=groundtruth_weights_list)
......@@ -1908,9 +1917,12 @@ class FasterRCNNMetaArch(model.DetectionModel):
unmatched_mask_label = tf.zeros(image_shape[1:3], dtype=tf.float32)
(batch_mask_targets, _, _, batch_mask_target_weights,
_) = target_assigner.batch_assign_targets(
self._detector_target_assigner, proposal_boxlists,
groundtruth_boxlists, groundtruth_masks_list, unmatched_mask_label,
groundtruth_weights_list)
target_assigner=self._detector_target_assigner,
anchors_batch=proposal_boxlists,
gt_box_batch=groundtruth_boxlists,
gt_class_targets_batch=groundtruth_masks_list,
unmatched_class_label=unmatched_mask_label,
gt_weights_batch=groundtruth_weights_list)
# Pad the prediction_masks with zeros for the background class to be
# consistent with class predictions.
......
......@@ -230,9 +230,14 @@ class FasterRCNNMetaArchTest(
tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)
]
groundtruth_weights_list = [
tf.constant([1, 1], dtype=tf.float32),
tf.constant([1, 1], dtype=tf.float32)]
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
model.provide_groundtruth(
groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_weights_list=groundtruth_weights_list)
result_tensor_dict = model.predict(preprocessed_inputs, true_image_shapes)
mask_shape_1 = 1 if masks_are_class_agnostic else model._num_classes
......
......@@ -511,10 +511,14 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
groundtruth_classes_list = [
tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
groundtruth_weights_list = [
tf.constant([1, 1], dtype=tf.float32),
tf.constant([1, 1], dtype=tf.float32)]
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
model.provide_groundtruth(
groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_weights_list=groundtruth_weights_list)
result_tensor_dict = model.predict(preprocessed_inputs, true_image_shapes)
expected_shapes = {
......@@ -663,10 +667,15 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)]
groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
groundtruth_weights_list = [
tf.constant([1, 1], dtype=tf.float32),
tf.constant([1, 1], dtype=tf.float32)
]
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
model.provide_groundtruth(
groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_weights_list=groundtruth_weights_list)
proposals = model.postprocess({
'rpn_box_encodings': rpn_box_encodings,
'rpn_objectness_predictions_with_background':
......
......@@ -243,7 +243,9 @@ class SSDMetaArch(model.DetectionModel):
freeze_batchnorm=False,
inplace_batchnorm_update=False,
add_background_class=True,
random_example_sampler=None):
random_example_sampler=None,
expected_classification_loss_under_sampling=None,
target_assigner_instance=None):
"""SSDMetaArch Constructor.
TODO(rathodv,jonathanhuang): group NMS parameters + score converter into
......@@ -308,6 +310,9 @@ class SSDMetaArch(model.DetectionModel):
example miner can both be applied to the model. In that case, random
sampler will take effect first and hard example miner can only process
the random sampled examples.
expected_classification_loss_under_sampling: If not None, used to
calculate the classification loss by background/foreground weighting.
target_assigner_instance: target_assigner.TargetAssigner instance to use.
"""
super(SSDMetaArch, self).__init__(num_classes=box_predictor.num_classes)
self._is_training = is_training
......@@ -342,11 +347,14 @@ class SSDMetaArch(model.DetectionModel):
self._unmatched_class_label = tf.constant((self.num_classes + 1) * [0],
tf.float32)
self._target_assigner = target_assigner.TargetAssigner(
self._region_similarity_calculator,
self._matcher,
self._box_coder,
negative_class_weight=negative_class_weight)
if target_assigner_instance:
self._target_assigner = target_assigner_instance
else:
self._target_assigner = target_assigner.TargetAssigner(
self._region_similarity_calculator,
self._matcher,
self._box_coder,
negative_class_weight=negative_class_weight)
self._classification_loss = classification_loss
self._localization_loss = localization_loss
......@@ -365,6 +373,8 @@ class SSDMetaArch(model.DetectionModel):
self._anchors = None
self._add_summaries = add_summaries
self._batched_prediction_tensor_names = []
self._expected_classification_loss_under_sampling = (
expected_classification_loss_under_sampling)
@property
def anchors(self):
......@@ -696,19 +706,34 @@ class SSDMetaArch(model.DetectionModel):
batch_reg_targets,
ignore_nan_targets=True,
weights=batch_reg_weights)
cls_losses = ops.reduce_sum_trailing_dimensions(
self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
weights=batch_cls_weights),
ndims=2)
if self._hard_example_miner:
cls_losses = self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
weights=batch_cls_weights)
if self._expected_classification_loss_under_sampling:
if cls_losses.get_shape().ndims == 3:
batch_size, num_anchors, num_classes = cls_losses.get_shape()
cls_losses = tf.reshape(cls_losses, [batch_size, -1])
batch_cls_targets = tf.reshape(
batch_cls_targets, [batch_size, num_anchors * num_classes, -1])
batch_cls_targets = tf.concat(
[1 - batch_cls_targets, batch_cls_targets], axis=-1)
cls_losses = self._expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses)
classification_loss = tf.reduce_sum(cls_losses)
localization_loss = tf.reduce_sum(location_losses)
elif self._hard_example_miner:
cls_losses = ops.reduce_sum_trailing_dimensions(cls_losses, ndims=2)
(localization_loss, classification_loss) = self._apply_hard_mining(
location_losses, cls_losses, prediction_dict, match_list)
if self._add_summaries:
self._hard_example_miner.summarize()
else:
cls_losses = ops.reduce_sum_trailing_dimensions(cls_losses, ndims=2)
if self._add_summaries:
class_ids = tf.argmax(batch_cls_targets, axis=2)
flattened_class_ids = tf.reshape(class_ids, [-1])
......@@ -993,4 +1018,3 @@ class SSDMetaArch(model.DetectionModel):
variables_to_restore[var_name] = variable
return variables_to_restore
......@@ -26,7 +26,9 @@ from object_detection.core import box_list
from object_detection.core import losses
from object_detection.core import post_processing
from object_detection.core import region_similarity_calculator as sim_calc
from object_detection.core import target_assigner
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.utils import ops
from object_detection.utils import test_case
from object_detection.utils import test_utils
......@@ -117,6 +119,10 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
normalize_loc_loss_by_codesize=False,
add_background_class=True,
random_example_sampling=False,
weight_regression_loss_by_score=False,
use_expected_classification_loss_under_sampling=False,
minimum_negative_sampling=1,
desired_negative_sampling_ratio=3,
use_keras=False):
is_training = False
num_classes = 1
......@@ -163,6 +169,20 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
random_example_sampler = sampler.BalancedPositiveNegativeSampler(
positive_fraction=0.5)
target_assigner_instance = target_assigner.TargetAssigner(
region_similarity_calculator,
mock_matcher,
mock_box_coder,
negative_class_weight=negative_class_weight,
weight_regression_loss_by_score=weight_regression_loss_by_score)
expected_classification_loss_under_sampling = None
if use_expected_classification_loss_under_sampling:
expected_classification_loss_under_sampling = functools.partial(
ops.expected_classification_loss_under_sampling,
minimum_negative_sampling=minimum_negative_sampling,
desired_negative_sampling_ratio=desired_negative_sampling_ratio)
code_size = 4
model = ssd_meta_arch.SSDMetaArch(
is_training,
......@@ -183,12 +203,15 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
localization_loss_weight,
normalize_loss_by_num_matches,
hard_example_miner,
target_assigner_instance=target_assigner_instance,
add_summaries=False,
normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize,
freeze_batchnorm=False,
inplace_batchnorm_update=False,
add_background_class=add_background_class,
random_example_sampler=random_example_sampler)
random_example_sampler=random_example_sampler,
expected_classification_loss_under_sampling=
expected_classification_loss_under_sampling)
return model, num_classes, mock_anchor_generator.num_anchors(), code_size
def test_preprocess_preserves_shapes_with_dynamic_input_image(
......@@ -470,6 +493,94 @@ class SsdMetaArchTest(test_case.TestCase, parameterized.TestCase):
groundtruth_classes1 = np.array([[0, 1]], dtype=np.float32)
groundtruth_classes2 = np.array([[0, 1]], dtype=np.float32)
expected_localization_loss = 0.0
expected_classification_loss = (
batch_size * num_anchors * (num_classes + 1) * np.log(2.0))
(localization_loss, classification_loss) = self.execute(
graph_fn, [
preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_classes1, groundtruth_classes2
])
self.assertAllClose(localization_loss, expected_localization_loss)
self.assertAllClose(classification_loss, expected_classification_loss)
def test_loss_with_expected_classification_loss(self, use_keras):
with tf.Graph().as_default():
_, num_classes, num_anchors, _ = self._create_model(use_keras=use_keras)
def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_classes1, groundtruth_classes2):
groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2]
groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2]
model, _, _, _ = self._create_model(
apply_hard_mining=False,
add_background_class=True,
use_expected_classification_loss_under_sampling=True,
minimum_negative_sampling=1,
desired_negative_sampling_ratio=desired_negative_sampling_ratio)
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
prediction_dict = model.predict(
preprocessed_tensor, true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return (loss_dict['Loss/localization_loss'],
loss_dict['Loss/classification_loss'])
batch_size = 2
desired_negative_sampling_ratio = 4
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32)
groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32)
groundtruth_classes1 = np.array([[1]], dtype=np.float32)
groundtruth_classes2 = np.array([[1]], dtype=np.float32)
expected_localization_loss = 0.0
expected_classification_loss = (
batch_size * (desired_negative_sampling_ratio * num_anchors +
num_classes * num_anchors) * np.log(2.0))
(localization_loss, classification_loss) = self.execute(
graph_fn, [
preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_classes1, groundtruth_classes2
])
self.assertAllClose(localization_loss, expected_localization_loss)
self.assertAllClose(classification_loss, expected_classification_loss)
def test_loss_results_are_correct_with_weight_regression_loss_by_score(
self, use_keras):
with tf.Graph().as_default():
_, num_classes, num_anchors, _ = self._create_model(
use_keras=use_keras,
add_background_class=False,
weight_regression_loss_by_score=True)
def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2,
groundtruth_classes1, groundtruth_classes2):
groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2]
groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2]
model, _, _, _ = self._create_model(
use_keras=use_keras,
apply_hard_mining=False,
add_background_class=False,
weight_regression_loss_by_score=True)
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list)
prediction_dict = model.predict(
preprocessed_tensor, true_image_shapes=None)
loss_dict = model.loss(prediction_dict, true_image_shapes=None)
return (loss_dict['Loss/localization_loss'],
loss_dict['Loss/classification_loss'])
batch_size = 2
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
groundtruth_boxes1 = np.array([[0, 0, 1, 1]], dtype=np.float32)
groundtruth_boxes2 = np.array([[0, 0, 1, 1]], dtype=np.float32)
groundtruth_classes1 = np.array([[0, 1]], dtype=np.float32)
groundtruth_classes2 = np.array([[1, 0]], dtype=np.float32)
expected_localization_loss = 0.25
expected_classification_loss = (
batch_size * num_anchors * (num_classes + 1) * np.log(2.0))
(localization_loss, classification_loss) = self.execute(
......
......@@ -201,14 +201,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
for key, value in iter(box_metrics.items())}
return box_metrics
def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores, detection_classes,
groundtruth_is_crowd=None,
num_gt_boxes_per_image=None,
num_det_boxes_per_image=None):
"""Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`.
def get_estimator_eval_metric_ops(self, eval_dict):
"""Returns a dictionary of eval metric ops.
Note that once value_op is called, the detections and groundtruth added via
update_op are cleared.
......@@ -218,35 +212,18 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
tensors need not be present.
Args:
image_id: string/integer tensor of shape [batch] with unique identifiers
for the images.
groundtruth_boxes: float32 tensor of shape [batch, num_boxes, 4]
containing `num_boxes` groundtruth boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
groundtruth_classes: int32 tensor of shape [batch, num_boxes] containing
1-indexed groundtruth classes for the boxes.
detection_boxes: float32 tensor of shape [batch, num_boxes, 4] containing
`num_boxes` detection boxes of the format [ymin, xmin, ymax, xmax]
in absolute image coordinates.
detection_scores: float32 tensor of shape [batch, num_boxes] containing
detection scores for the boxes.
detection_classes: int32 tensor of shape [batch, num_boxes] containing
1-indexed detection classes for the boxes.
groundtruth_is_crowd: bool tensor of shape [batch, num_boxes] containing
is_crowd annotations. This field is optional, and if not passed, then
all boxes are treated as *not* is_crowd.
num_gt_boxes_per_image: int32 tensor of shape [batch] containing the
number of groundtruth boxes per image. If None, will assume no padding
in groundtruth tensors.
num_det_boxes_per_image: int32 tensor of shape [batch] containing the
number of detection boxes per image. If None, will assume no padding in
the detection tensors.
eval_dict: A dictionary that holds tensors for evaluating object detection
performance. For single-image evaluation, this dictionary may be
produced from eval_util.result_dict_for_single_example(). For multi-image
evaluation, `eval_dict` should contain the fields
'num_groundtruth_boxes_per_image' and 'num_det_boxes_per_image' to
properly unpad the tensors from the batch.
Returns:
a dictionary of metric names to tuple of value_op and update_op that can
be used as eval metric ops in tf.EstimatorSpec. Note that all update ops
must be run together and similarly all value ops must be run together to
guarantee correct behaviour.
be used as eval metric ops in tf.estimator.EstimatorSpec. Note that all
update ops must be run together and similarly all value ops must be run
together to guarantee correct behaviour.
"""
def update_op(
image_id_batched,
......@@ -278,6 +255,22 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
'detection_scores': det_score[:num_det_box],
'detection_classes': det_class[:num_det_box]})
# Unpack items from the evaluation dictionary.
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
image_id = eval_dict[input_data_fields.key]
groundtruth_boxes = eval_dict[input_data_fields.groundtruth_boxes]
groundtruth_classes = eval_dict[input_data_fields.groundtruth_classes]
groundtruth_is_crowd = eval_dict.get(
input_data_fields.groundtruth_is_crowd, None)
detection_boxes = eval_dict[detection_fields.detection_boxes]
detection_scores = eval_dict[detection_fields.detection_scores]
detection_classes = eval_dict[detection_fields.detection_classes]
num_gt_boxes_per_image = eval_dict.get(
'num_groundtruth_boxes_per_image', None)
num_det_boxes_per_image = eval_dict.get(
'num_det_boxes_per_image', None)
if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
if not image_id.shape.as_list():
......@@ -553,42 +546,22 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
for key, value in mask_metrics.items()}
return mask_metrics
def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes,
groundtruth_classes,
groundtruth_instance_masks,
detection_scores, detection_classes,
detection_masks, groundtruth_is_crowd=None):
"""Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`.
def get_estimator_eval_metric_ops(self, eval_dict):
"""Returns a dictionary of eval metric ops.
Note that once value_op is called, the detections and groundtruth added via
update_op are cleared.
Args:
image_id: Unique string/integer identifier for the image.
groundtruth_boxes: float32 tensor of shape [num_boxes, 4] containing
`num_boxes` groundtruth boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
groundtruth_classes: int32 tensor of shape [num_boxes] containing
1-indexed groundtruth classes for the boxes.
groundtruth_instance_masks: uint8 tensor array of shape
[num_boxes, image_height, image_width] containing groundtruth masks
corresponding to the boxes. The elements of the array must be in {0, 1}.
detection_scores: float32 tensor of shape [num_boxes] containing
detection scores for the boxes.
detection_classes: int32 tensor of shape [num_boxes] containing
1-indexed detection classes for the boxes.
detection_masks: uint8 tensor array of shape
[num_boxes, image_height, image_width] containing instance masks
corresponding to the boxes. The elements of the array must be in {0, 1}.
groundtruth_is_crowd: bool tensor of shape [batch, num_boxes] containing
is_crowd annotations. This field is optional, and if not passed, then
all boxes are treated as *not* is_crowd.
eval_dict: A dictionary that holds tensors for evaluating object detection
performance. This dictionary may be produced from
eval_util.result_dict_for_single_example().
Returns:
a dictionary of metric names to tuple of value_op and update_op that can
be used as eval metric ops in tf.EstimatorSpec. Note that all update ops
must be run together and similarly all value ops must be run together to
guarantee correct behaviour.
be used as eval metric ops in tf.estimator.EstimatorSpec. Note that all
update ops must be run together and similarly all value ops must be run
together to guarantee correct behaviour.
"""
def update_op(
image_id,
......@@ -599,6 +572,7 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_scores,
detection_classes,
detection_masks):
"""Update op for metrics."""
self.add_single_ground_truth_image_info(
image_id,
{'groundtruth_boxes': groundtruth_boxes,
......@@ -611,6 +585,20 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
'detection_classes': detection_classes,
'detection_masks': detection_masks})
# Unpack items from the evaluation dictionary.
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
image_id = eval_dict[input_data_fields.key]
groundtruth_boxes = eval_dict[input_data_fields.groundtruth_boxes]
groundtruth_classes = eval_dict[input_data_fields.groundtruth_classes]
groundtruth_instance_masks = eval_dict[
input_data_fields.groundtruth_instance_masks]
groundtruth_is_crowd = eval_dict.get(
input_data_fields.groundtruth_is_crowd, None)
detection_scores = eval_dict[detection_fields.detection_scores]
detection_classes = eval_dict[detection_fields.detection_classes]
detection_masks = eval_dict[detection_fields.detection_masks]
if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
update_op = tf.py_func(update_op, [image_id,
......
......@@ -258,12 +258,18 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
detection_scores = tf.placeholder(tf.float32, shape=(None))
detection_classes = tf.placeholder(tf.float32, shape=(None))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes)
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
detection_fields.detection_boxes: detection_boxes,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
......@@ -336,9 +342,18 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
detection_scores = tf.placeholder(tf.float32, shape=(None))
detection_classes = tf.placeholder(tf.float32, shape=(None))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes, groundtruth_classes, detection_boxes,
detection_scores, detection_classes)
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
detection_fields.detection_boxes: detection_boxes,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
......@@ -426,12 +441,18 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes)
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
detection_fields.detection_boxes: detection_boxes,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
......@@ -486,14 +507,20 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
num_det_boxes_per_image = tf.placeholder(tf.int32, shape=(None))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes,
num_gt_boxes_per_image=num_gt_boxes_per_image,
num_det_boxes_per_image=num_det_boxes_per_image)
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
detection_fields.detection_boxes: detection_boxes,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes,
'num_groundtruth_boxes_per_image': num_gt_boxes_per_image,
'num_det_boxes_per_image': num_det_boxes_per_image
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
......@@ -642,13 +669,19 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
detection_classes = tf.placeholder(tf.float32, shape=(None))
detection_masks = tf.placeholder(tf.uint8, shape=(None, None, None))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
groundtruth_masks,
detection_scores,
detection_classes,
detection_masks)
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
input_data_fields.groundtruth_instance_masks: groundtruth_masks,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes,
detection_fields.detection_masks: detection_masks,
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionMasks_Precision/mAP']
......
......@@ -234,6 +234,9 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
gt_keypoints_list = None
if fields.InputDataFields.groundtruth_keypoints in labels:
gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
gt_weights_list = None
if fields.InputDataFields.groundtruth_weights in labels:
gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
if fields.InputDataFields.groundtruth_is_crowd in labels:
gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
detection_model.provide_groundtruth(
......@@ -241,8 +244,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
groundtruth_classes_list=gt_classes_list,
groundtruth_masks_list=gt_masks_list,
groundtruth_keypoints_list=gt_keypoints_list,
groundtruth_weights_list=labels[
fields.InputDataFields.groundtruth_weights],
groundtruth_weights_list=gt_weights_list,
groundtruth_is_crowd_list=gt_is_crowd_list)
preprocessed_images = features[fields.InputDataFields.image]
......@@ -313,10 +315,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
# Optionally freeze some layers by setting their gradients to be zero.
trainable_variables = None
if train_config.freeze_variables:
trainable_variables = tf.contrib.framework.filter_variables(
tf.trainable_variables(),
exclude_patterns=train_config.freeze_variables)
include_variables = (
train_config.update_trainable_variables
if train_config.update_trainable_variables else None)
exclude_variables = (
train_config.freeze_variables
if train_config.freeze_variables else None)
trainable_variables = tf.contrib.framework.filter_variables(
tf.trainable_variables(),
include_patterns=include_variables,
exclude_patterns=exclude_variables)
clip_gradients_value = None
if train_config.gradient_clipping_by_norm > 0:
......@@ -377,14 +385,10 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
detection_and_groundtruth)
# Eval metrics on a single example.
eval_metrics = eval_config.metrics_set
if not eval_metrics:
eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics,
eval_config,
category_index.values(),
eval_dict,
include_metrics_per_category=eval_config.include_metrics_per_category)
eval_dict)
for loss_key, loss_tensor in iter(losses_dict.items()):
eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
for var in optimizer_summary_vars:
......
......@@ -178,6 +178,31 @@ class ModelLibTest(tf.test.TestCase):
configs = _get_configs_for_model(MODEL_NAME_FOR_TEST)
self._assert_model_fn_for_train_eval(configs, 'train')
def test_model_fn_in_train_mode_freeze_all_variables(self):
"""Tests model_fn TRAIN mode with all variables frozen."""
configs = _get_configs_for_model(MODEL_NAME_FOR_TEST)
configs['train_config'].freeze_variables.append('.*')
with self.assertRaisesRegexp(ValueError, 'No variables to optimize'):
self._assert_model_fn_for_train_eval(configs, 'train')
def test_model_fn_in_train_mode_freeze_all_included_variables(self):
"""Tests model_fn TRAIN mode with all included variables frozen."""
configs = _get_configs_for_model(MODEL_NAME_FOR_TEST)
train_config = configs['train_config']
train_config.update_trainable_variables.append('FeatureExtractor')
train_config.freeze_variables.append('.*')
with self.assertRaisesRegexp(ValueError, 'No variables to optimize'):
self._assert_model_fn_for_train_eval(configs, 'train')
def test_model_fn_in_train_mode_freeze_box_predictor(self):
"""Tests model_fn TRAIN mode with FeatureExtractor variables frozen."""
configs = _get_configs_for_model(MODEL_NAME_FOR_TEST)
train_config = configs['train_config']
train_config.update_trainable_variables.append('FeatureExtractor')
train_config.update_trainable_variables.append('BoxPredictor')
train_config.freeze_variables.append('FeatureExtractor')
self._assert_model_fn_for_train_eval(configs, 'train')
def test_model_fn_in_eval_mode(self):
"""Tests the model function in EVAL mode."""
configs = _get_configs_for_model(MODEL_NAME_FOR_TEST)
......
......@@ -18,6 +18,7 @@ import numpy as np
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
from object_detection.predictors import convolutional_box_predictor as box_predictor
from object_detection.protos import hyperparams_pb2
......@@ -44,18 +45,18 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
def test_get_boxes_for_five_aspect_ratios_per_location(self):
def graph_fn(image_features):
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
conv_box_predictor = (
box_predictor_builder.build_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
......@@ -73,18 +74,18 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
def test_get_boxes_for_one_aspect_ratio_per_location(self):
def graph_fn(image_features):
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
conv_box_predictor = (
box_predictor_builder.build_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[1],
scope='BoxPredictor')
......@@ -104,18 +105,18 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
num_classes_without_background = 6
image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
def graph_fn(image_features):
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
conv_box_predictor = (
box_predictor_builder.build_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features],
num_predictions_per_location=[5],
......@@ -136,18 +137,18 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
def test_get_predictions_with_feature_maps_of_dynamic_shape(
self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
conv_box_predictor = (
box_predictor_builder.build_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
......@@ -183,19 +184,19 @@ class ConvolutionalBoxPredictorTest(test_case.TestCase):
def test_use_depthwise_convolution(self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4,
use_dropout=True,
use_depthwise=True
)
conv_box_predictor = (
box_predictor_builder.build_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4,
use_dropout=True,
use_depthwise=True))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
......@@ -278,13 +279,14 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
def test_get_boxes_for_five_aspect_ratios_per_location(self):
def graph_fn(image_features):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
......@@ -302,14 +304,15 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
def test_bias_predictions_to_background_with_sigmoid_score_conversion(self):
def graph_fn(image_features):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=True,
num_classes=2,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
class_prediction_bias_init=-4.6,
box_code_size=4)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=True,
num_classes=2,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
class_prediction_bias_init=-4.6,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
......@@ -325,13 +328,14 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
num_classes_without_background = 6
def graph_fn(image_features):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features],
num_predictions_per_location=[5],
......@@ -354,13 +358,14 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
......@@ -385,13 +390,14 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2, image_features3):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2, image_features3],
num_predictions_per_location=[5, 5, 5],
......@@ -416,13 +422,14 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
......@@ -482,14 +489,15 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
apply_batch_norm=False)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
apply_batch_norm=False))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
......@@ -540,14 +548,15 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
def test_no_batchnorm_params_when_batchnorm_is_not_configured(self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_conv_arg_scope_no_batch_norm(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
apply_batch_norm=False)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_conv_arg_scope_no_batch_norm(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
apply_batch_norm=False))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
......@@ -599,14 +608,15 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
share_prediction_tower=True)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
share_prediction_tower=True))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
......@@ -653,15 +663,16 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
share_prediction_tower=True,
apply_batch_norm=False)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4,
share_prediction_tower=True,
apply_batch_norm=False))
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
......@@ -698,18 +709,20 @@ class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
'ClassPredictor/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_get_predictions_with_feature_maps_of_dynamic_shape(
self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
conv_box_predictor = (
box_predictor_builder.build_weight_shared_convolutional_box_predictor(
is_training=False,
num_classes=0,
conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4))
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
......
......@@ -13,16 +13,25 @@
# limitations under the License.
# ==============================================================================
"""Mask R-CNN Box Head."""
"""Box Head.
Contains Box prediction head classes for different meta architectures.
All the box prediction heads have a predict function that receives the
`features` as the first argument and returns `box_encodings`.
"""
import tensorflow as tf
from object_detection.predictors.mask_rcnn_heads import mask_rcnn_head
from object_detection.predictors.heads import head
slim = tf.contrib.slim
class BoxHead(mask_rcnn_head.MaskRCNNHead):
"""Mask RCNN box prediction head."""
class MaskRCNNBoxHead(head.Head):
"""Box prediction head.
Please refer to Mask RCNN paper:
https://arxiv.org/abs/1703.06870
"""
def __init__(self,
is_training,
......@@ -51,7 +60,7 @@ class BoxHead(mask_rcnn_head.MaskRCNNHead):
share_box_across_classes: Whether to share boxes across classes rather
than use a different box for each class.
"""
super(BoxHead, self).__init__()
super(MaskRCNNBoxHead, self).__init__()
self._is_training = is_training
self._num_classes = num_classes
self._fc_hyperparams_fn = fc_hyperparams_fn
......@@ -60,20 +69,27 @@ class BoxHead(mask_rcnn_head.MaskRCNNHead):
self._box_code_size = box_code_size
self._share_box_across_classes = share_box_across_classes
def _predict(self, roi_pooled_features):
def predict(self, features, num_predictions_per_location=1):
"""Predicts boxes.
Args:
roi_pooled_features: A float tensor of shape [batch_size, height, width,
features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images.
num_predictions_per_location: Int containing number of predictions per
location.
Returns:
box_encodings: A float tensor of shape
[batch_size, 1, num_classes, code_size] representing the location of the
objects.
Raises:
ValueError: If num_predictions_per_location is not 1.
"""
if num_predictions_per_location != 1:
raise ValueError('Only num_predictions_per_location=1 is supported')
spatial_averaged_roi_pooled_features = tf.reduce_mean(
roi_pooled_features, [1, 2], keep_dims=True, name='AvgPool')
features, [1, 2], keep_dims=True, name='AvgPool')
flattened_roi_pooled_features = slim.flatten(
spatial_averaged_roi_pooled_features)
if self._use_dropout:
......@@ -94,3 +110,130 @@ class BoxHead(mask_rcnn_head.MaskRCNNHead):
box_encodings = tf.reshape(box_encodings,
[-1, 1, number_of_boxes, self._box_code_size])
return box_encodings
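A minimal usage sketch of the renamed head, assuming the module's imports above; `fc_fn` is a hypothetical stand-in for a tf-slim arg_scope function (the tests further down build one from a hyperparams proto), and the keyword list is reconstructed from the attributes the constructor sets:

```python
import tensorflow as tf

head = MaskRCNNBoxHead(
    is_training=False,
    num_classes=20,
    fc_hyperparams_fn=fc_fn,  # hypothetical arg_scope function
    use_dropout=False,
    dropout_keep_prob=1.0,
    box_code_size=4,
    share_box_across_classes=False)
roi_features = tf.random_uniform([64, 7, 7, 1024])
# Output shape [64, 1, 20, 4]: [batch, one box per RoI, num_classes, code_size].
encodings = head.predict(features=roi_features, num_predictions_per_location=1)
```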
class ConvolutionalBoxHead(head.Head):
"""Convolutional box prediction head."""
def __init__(self,
is_training,
box_code_size,
kernel_size,
use_depthwise=False):
"""Constructor.
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
box_code_size: Size of encoding for each box.
kernel_size: Size of final convolution kernel. If the
spatial resolution of the feature map is smaller than the kernel size,
then the kernel size is automatically set to be
min(feature_width, feature_height).
use_depthwise: Whether to use depthwise convolutions for prediction
steps. Default is False.
"""
super(ConvolutionalBoxHead, self).__init__()
self._is_training = is_training
self._box_code_size = box_code_size
self._kernel_size = kernel_size
self._use_depthwise = use_depthwise
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
Args:
features: A float tensor of shape [batch_size, height, width, channels]
containing image features.
num_predictions_per_location: Number of box predictions to be made per
spatial location. Int specifying number of boxes per location.
Returns:
box_encodings: A float tensor of shape
[batch_size, num_anchors, q, code_size] representing the location of
the objects, where q is 1 or the number of classes.
"""
net = features
if self._use_depthwise:
box_encodings = slim.separable_conv2d(
net, None, [self._kernel_size, self._kernel_size],
padding='SAME', depth_multiplier=1, stride=1,
rate=1, scope='BoxEncodingPredictor_depthwise')
box_encodings = slim.conv2d(
box_encodings,
num_predictions_per_location * self._box_code_size, [1, 1],
activation_fn=None,
normalizer_fn=None,
normalizer_params=None,
scope='BoxEncodingPredictor')
else:
box_encodings = slim.conv2d(
net, num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
activation_fn=None,
normalizer_fn=None,
normalizer_params=None,
scope='BoxEncodingPredictor')
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
box_encodings = tf.reshape(box_encodings,
[batch_size, -1, 1, self._box_code_size])
return box_encodings
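The reshape above pins down the shape contract: a [batch, H, W, num_predictions_per_location * code_size] conv output becomes [batch, H * W * num_predictions_per_location, 1, code_size]. A minimal sketch, assuming TF 1.x and the module's imports:

```python
import tensorflow as tf

head = ConvolutionalBoxHead(is_training=True, box_code_size=4, kernel_size=3)
features = tf.random_uniform([8, 17, 19, 512])
box_encodings = head.predict(features, num_predictions_per_location=2)
# 17 * 19 spatial locations * 2 predictions each = 646 anchors.
print(box_encodings.get_shape().as_list())  # [8, 646, 1, 4]
```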
# TODO(alirezafathi): See if possible to unify Weight Shared with regular
# convolutional box head.
class WeightSharedConvolutionalBoxHead(head.Head):
"""Weight shared convolutional box prediction head.
This head allows sharing the same set of parameters (weights) when called more
than once on different feature maps.
"""
def __init__(self,
box_code_size,
kernel_size=3,
class_prediction_bias_init=0.0):
"""Constructor.
Args:
box_code_size: Size of encoding for each box.
kernel_size: Size of final convolution kernel.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction (currently unused by this box head).
"""
super(WeightSharedConvolutionalBoxHead, self).__init__()
self._box_code_size = box_code_size
self._kernel_size = kernel_size
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
Args:
features: A float tensor of shape [batch_size, height, width, channels]
containing image features.
num_predictions_per_location: Number of box predictions to be made per
spatial location.
Returns:
box_encodings: A float tensor of shape
[batch_size, num_anchors, code_size] representing the location of
the objects.
"""
box_encodings_net = features
box_encodings = slim.conv2d(
box_encodings_net,
num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
activation_fn=None, stride=1, padding='SAME',
normalizer_fn=None,
scope='BoxPredictor')
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
box_encodings = tf.reshape(box_encodings,
[batch_size, -1, self._box_code_size])
return box_encodings
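Weight sharing itself comes from the caller: the wrapping predictor invokes predict() on each feature map under one variable scope with reuse enabled, so the 'BoxPredictor' convolution variables are created only once. A minimal sketch of that pattern, assuming TF 1.x (the explicit tf.AUTO_REUSE scoping is an illustration, not how the predictor names its scopes):

```python
import tensorflow as tf

head = WeightSharedConvolutionalBoxHead(box_code_size=4)
feats_hi = tf.random_uniform([8, 32, 32, 64])  # higher-resolution feature map
feats_lo = tf.random_uniform([8, 16, 16, 64])  # lower-resolution feature map
with tf.variable_scope('SharedHead', reuse=tf.AUTO_REUSE):
  enc_hi = head.predict(feats_hi, num_predictions_per_location=6)
with tf.variable_scope('SharedHead', reuse=tf.AUTO_REUSE):
  enc_lo = head.predict(feats_lo, num_predictions_per_location=6)
# Both calls read the same 'SharedHead/BoxPredictor' kernel and bias, which
# requires matching input channel counts (64 here) across feature maps.
```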
......@@ -13,17 +13,17 @@
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.mask_rcnn_heads.box_head."""
"""Tests for object_detection.predictors.heads.box_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.mask_rcnn_heads import box_head
from object_detection.predictors.heads import box_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class BoxHeadTest(test_case.TestCase):
class MaskRCNNBoxHeadTest(test_case.TestCase):
def _build_arg_scope_with_hyperparams(self,
op_type=hyperparams_pb2.Hyperparams.FC):
......@@ -44,7 +44,7 @@ class BoxHeadTest(test_case.TestCase):
return hyperparams_builder.build(hyperparams, is_training=True)
def test_prediction_size(self):
box_prediction_head = box_head.BoxHead(
box_prediction_head = box_head.MaskRCNNBoxHead(
is_training=False,
num_classes=20,
fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
......@@ -55,10 +55,73 @@ class BoxHeadTest(test_case.TestCase):
roi_pooled_features = tf.random_uniform(
[64, 7, 7, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
prediction = box_prediction_head.predict(
roi_pooled_features=roi_pooled_features)
tf.logging.info(prediction.shape)
features=roi_pooled_features, num_predictions_per_location=1)
self.assertAllEqual([64, 1, 20, 4], prediction.get_shape().as_list())
class ConvolutionalBoxPredictorTest(test_case.TestCase):
def _build_arg_scope_with_hyperparams(
self, op_type=hyperparams_pb2.Hyperparams.CONV):
hyperparams = hyperparams_pb2.Hyperparams()
hyperparams_text_proto = """
activation: NONE
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
"""
text_format.Merge(hyperparams_text_proto, hyperparams)
hyperparams.op = op_type
return hyperparams_builder.build(hyperparams, is_training=True)
def test_prediction_size(self):
box_prediction_head = box_head.ConvolutionalBoxHead(
is_training=True,
box_code_size=4,
kernel_size=3)
image_feature = tf.random_uniform(
[64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
box_encodings = box_prediction_head.predict(
features=image_feature,
num_predictions_per_location=1)
self.assertAllEqual([64, 323, 1, 4], box_encodings.get_shape().as_list())
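(For reference, 323 is the anchor count implied by the shape contract: 17 * 19 spatial locations times one prediction per location.)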
class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
def _build_arg_scope_with_hyperparams(
self, op_type=hyperparams_pb2.Hyperparams.CONV):
hyperparams = hyperparams_pb2.Hyperparams()
hyperparams_text_proto = """
activation: NONE
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
"""
text_format.Merge(hyperparams_text_proto, hyperparams)
hyperparams.op = op_type
return hyperparams_builder.build(hyperparams, is_training=True)
def test_prediction_size(self):
box_prediction_head = box_head.WeightSharedConvolutionalBoxHead(
box_code_size=4)
image_feature = tf.random_uniform(
[64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
box_encodings = box_prediction_head.predict(
features=image_feature,
num_predictions_per_location=1)
self.assertAllEqual([64, 323, 4], box_encodings.get_shape().as_list())
if __name__ == '__main__':
tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class Head.
Contains Class prediction head classes for different meta architectures.
All the class prediction heads have a predict function that receives the
`features` as the first argument and returns class predictions with background.
"""
import tensorflow as tf
from object_detection.predictors.heads import head
slim = tf.contrib.slim
class MaskRCNNClassHead(head.Head):
"""Mask RCNN class prediction head.
Please refer to Mask RCNN paper:
https://arxiv.org/abs/1703.06870
"""
def __init__(self, is_training, num_classes, fc_hyperparams_fn,
use_dropout, dropout_keep_prob):
"""Constructor.
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
fc_hyperparams_fn: A function to generate tf-slim arg_scope with
hyperparameters for fully connected ops.
use_dropout: Option to use dropout or not. Note that a single dropout
op is applied here prior to both box and class predictions, in contrast
to the convolutional heads, where dropout is applied only before the
class prediction.
dropout_keep_prob: Keep probability for dropout.
This is only used if use_dropout is True.
"""
super(MaskRCNNClassHead, self).__init__()
self._is_training = is_training
self._num_classes = num_classes
self._fc_hyperparams_fn = fc_hyperparams_fn
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
def predict(self, features, num_predictions_per_location=1):
"""Predicts boxes and class scores.
Args:
features: A float tensor of shape [batch_size, height, width, channels]
containing features for a batch of images.
num_predictions_per_location: Int containing number of predictions per
location.
Returns:
class_predictions_with_background: A float tensor of shape
[batch_size, 1, num_classes + 1] representing the class predictions for
the proposals.
Raises:
ValueError: If num_predictions_per_location is not 1.
"""
if num_predictions_per_location != 1:
raise ValueError('Only num_predictions_per_location=1 is supported')
spatial_averaged_roi_pooled_features = tf.reduce_mean(
features, [1, 2], keep_dims=True, name='AvgPool')
flattened_roi_pooled_features = slim.flatten(
spatial_averaged_roi_pooled_features)
if self._use_dropout:
flattened_roi_pooled_features = slim.dropout(
flattened_roi_pooled_features,
keep_prob=self._dropout_keep_prob,
is_training=self._is_training)
with slim.arg_scope(self._fc_hyperparams_fn()):
class_predictions_with_background = slim.fully_connected(
flattened_roi_pooled_features,
self._num_classes + 1,
activation_fn=None,
scope='ClassPredictor')
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [-1, 1, self._num_classes + 1])
return class_predictions_with_background
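A minimal usage sketch, assuming the module's imports; `fc_fn` is again a hypothetical stand-in for a tf-slim arg_scope function like the ones the tests build:

```python
import tensorflow as tf

head = MaskRCNNClassHead(
    is_training=False,
    num_classes=20,
    fc_hyperparams_fn=fc_fn,  # hypothetical arg_scope function
    use_dropout=True,
    dropout_keep_prob=0.5)
roi_features = tf.random_uniform([64, 7, 7, 1024])
# Output shape [64, 1, 21]: num_classes plus one background slot.
scores = head.predict(features=roi_features, num_predictions_per_location=1)
```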
class ConvolutionalClassHead(head.Head):
"""Convolutional class prediction head."""
def __init__(self,
is_training,
num_classes,
use_dropout,
dropout_keep_prob,
kernel_size,
apply_sigmoid_to_scores=False,
class_prediction_bias_init=0.0,
use_depthwise=False):
"""Constructor.
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: Number of classes.
use_dropout: Option to use dropout or not. Note that dropout is applied
to the features ahead of the class prediction only; the corresponding
convolutional box head does not use dropout.
dropout_keep_prob: Keep probability for dropout.
This is only used if use_dropout is True.
kernel_size: Size of final convolution kernel. If the
spatial resolution of the feature map is smaller than the kernel size,
then the kernel size is automatically set to be
min(feature_width, feature_height).
apply_sigmoid_to_scores: if True, apply the sigmoid on the output
class_predictions.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction.
use_depthwise: Whether to use depthwise convolutions for prediction
steps. Default is False.
"""
super(ConvolutionalClassHead, self).__init__()
self._is_training = is_training
self._num_classes = num_classes
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
self._kernel_size = kernel_size
self._apply_sigmoid_to_scores = apply_sigmoid_to_scores
self._class_prediction_bias_init = class_prediction_bias_init
self._use_depthwise = use_depthwise
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
Args:
features: A float tensor of shape [batch_size, height, width, channels]
containing image features.
num_predictions_per_location: Number of box predictions to be made per
spatial location.
Returns:
class_predictions_with_background: A float tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals.
"""
net = features
# Add a slot for the background class.
num_class_slots = self._num_classes + 1
if self._use_dropout:
net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
if self._use_depthwise:
class_predictions_with_background = slim.separable_conv2d(
net, None, [self._kernel_size, self._kernel_size],
padding='SAME', depth_multiplier=1, stride=1,
rate=1, scope='ClassPredictor_depthwise')
class_predictions_with_background = slim.conv2d(
class_predictions_with_background,
num_predictions_per_location * num_class_slots, [1, 1],
activation_fn=None,
normalizer_fn=None,
normalizer_params=None,
scope='ClassPredictor')
else:
class_predictions_with_background = slim.conv2d(
net,
num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size],
activation_fn=None,
normalizer_fn=None,
normalizer_params=None,
scope='ClassPredictor',
biases_initializer=tf.constant_initializer(
self._class_prediction_bias_init))
if self._apply_sigmoid_to_scores:
class_predictions_with_background = tf.sigmoid(
class_predictions_with_background)
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots])
return class_predictions_with_background
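As with the box head, the final reshape fixes the contract: [batch, H, W, num_predictions_per_location * num_class_slots] becomes [batch, H * W * num_predictions_per_location, num_class_slots]. A minimal sketch exercising the depthwise path, assuming TF 1.x:

```python
import tensorflow as tf

head = ConvolutionalClassHead(
    is_training=True,
    num_classes=20,
    use_dropout=True,
    dropout_keep_prob=0.8,
    kernel_size=3,
    use_depthwise=True)
features = tf.random_uniform([8, 10, 10, 128])
scores = head.predict(features, num_predictions_per_location=3)
# 10 * 10 * 3 = 300 anchors, 20 + 1 = 21 class slots.
print(scores.get_shape().as_list())  # [8, 300, 21]
```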
# TODO(alirezafathi): See if possible to unify Weight Shared with regular
# convolutional class head.
class WeightSharedConvolutionalClassHead(head.Head):
"""Weight shared convolutional class prediction head.
This head allows sharing the same set of parameters (weights) when called more
than once on different feature maps.
"""
def __init__(self,
num_classes,
kernel_size=3,
class_prediction_bias_init=0.0,
use_dropout=False,
dropout_keep_prob=0.8):
"""Constructor.
Args:
num_classes: number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
kernel_size: Size of final convolution kernel.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction.
use_dropout: Whether to apply dropout to class prediction head.
dropout_keep_prob: Probability of keeping activations.
"""
super(WeightSharedConvolutionalClassHead, self).__init__()
self._num_classes = num_classes
self._kernel_size = kernel_size
self._class_prediction_bias_init = class_prediction_bias_init
self._use_dropout = use_dropout
self._dropout_keep_prob = dropout_keep_prob
def predict(self, features, num_predictions_per_location):
"""Predicts boxes.
Args:
features: A float tensor of shape [batch_size, height, width, channels]
containing image features.
num_predictions_per_location: Number of box predictions to be made per
spatial location.
Returns:
class_predictions_with_background: A tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals.
"""
class_predictions_net = features
num_class_slots = self._num_classes + 1
# Add a slot for the background class.
if self._use_dropout:
class_predictions_net = slim.dropout(
class_predictions_net, keep_prob=self._dropout_keep_prob)
class_predictions_with_background = slim.conv2d(
class_predictions_net,
num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size],
activation_fn=None, stride=1, padding='SAME',
normalizer_fn=None,
biases_initializer=tf.constant_initializer(
self._class_prediction_bias_init),
scope='ClassPredictor')
batch_size = features.get_shape().as_list()[0]
if batch_size is None:
batch_size = tf.shape(features)[0]
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [batch_size, -1, num_class_slots])
return class_predictions_with_background
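A minimal sketch, assuming TF 1.x; the negative bias initialization shown is the common prior-probability trick for dense detection heads (start with background overwhelmingly likely), which this diff enables but does not mandate:

```python
import math
import tensorflow as tf

head = WeightSharedConvolutionalClassHead(
    num_classes=20,
    # Initialize so that sigmoid(bias) is roughly 0.01 at the start.
    class_prediction_bias_init=-math.log((1.0 - 0.01) / 0.01),
    use_dropout=True,
    dropout_keep_prob=0.8)
features = tf.random_uniform([8, 16, 16, 64])
scores = head.predict(features, num_predictions_per_location=6)
# 16 * 16 * 6 = 1536 anchors: shape [8, 1536, 21].
```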
......@@ -13,17 +13,17 @@
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.predictors.mask_rcnn_heads.class_head."""
"""Tests for object_detection.predictors.heads.class_head."""
import tensorflow as tf
from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.predictors.mask_rcnn_heads import class_head
from object_detection.predictors.heads import class_head
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class ClassHeadTest(test_case.TestCase):
class MaskRCNNClassHeadTest(test_case.TestCase):
def _build_arg_scope_with_hyperparams(self,
op_type=hyperparams_pb2.Hyperparams.FC):
......@@ -44,7 +44,7 @@ class ClassHeadTest(test_case.TestCase):
return hyperparams_builder.build(hyperparams, is_training=True)
def test_prediction_size(self):
class_prediction_head = class_head.ClassHead(
class_prediction_head = class_head.MaskRCNNClassHead(
is_training=False,
num_classes=20,
fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(),
......@@ -53,10 +53,76 @@ class ClassHeadTest(test_case.TestCase):
roi_pooled_features = tf.random_uniform(
[64, 7, 7, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
prediction = class_prediction_head.predict(
roi_pooled_features=roi_pooled_features)
tf.logging.info(prediction.shape)
features=roi_pooled_features, num_predictions_per_location=1)
self.assertAllEqual([64, 1, 21], prediction.get_shape().as_list())
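(The 21 in the expected shape is num_classes + 1: twenty foreground classes plus one background slot.)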
class ConvolutionalClassPredictorTest(test_case.TestCase):
def _build_arg_scope_with_hyperparams(
self, op_type=hyperparams_pb2.Hyperparams.CONV):
hyperparams = hyperparams_pb2.Hyperparams()
hyperparams_text_proto = """
activation: NONE
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
"""
text_format.Merge(hyperparams_text_proto, hyperparams)
hyperparams.op = op_type
return hyperparams_builder.build(hyperparams, is_training=True)
def test_prediction_size(self):
class_prediction_head = class_head.ConvolutionalClassHead(
is_training=True,
num_classes=20,
use_dropout=True,
dropout_keep_prob=0.5,
kernel_size=3)
image_feature = tf.random_uniform(
[64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
class_predictions = class_prediction_head.predict(
features=image_feature,
num_predictions_per_location=1)
self.assertAllEqual([64, 323, 21],
class_predictions.get_shape().as_list())
class WeightSharedConvolutionalClassPredictorTest(test_case.TestCase):
def _build_arg_scope_with_hyperparams(
self, op_type=hyperparams_pb2.Hyperparams.CONV):
hyperparams = hyperparams_pb2.Hyperparams()
hyperparams_text_proto = """
activation: NONE
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
"""
text_format.Merge(hyperparams_text_proto, hyperparams)
hyperparams.op = op_type
return hyperparams_builder.build(hyperparams, is_training=True)
def test_prediction_size(self):
class_prediction_head = (
class_head.WeightSharedConvolutionalClassHead(num_classes=20))
image_feature = tf.random_uniform(
[64, 17, 19, 1024], minval=-10.0, maxval=10.0, dtype=tf.float32)
class_predictions = class_prediction_head.predict(
features=image_feature,
num_predictions_per_location=1)
self.assertAllEqual([64, 323, 21], class_predictions.get_shape().as_list())
if __name__ == '__main__':
tf.test.main()