Commit e00e0e13 authored by dreamdragon

Merge remote-tracking branch 'upstream/master'

parents b915db4e 402b561b
@@ -124,6 +124,8 @@ def transform_input_data(tensor_dict,
   if fields.InputDataFields.groundtruth_instance_masks in tensor_dict:
     masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
     _, resized_masks, _ = image_resizer_fn(image, masks)
+    if use_bfloat16:
+      resized_masks = tf.cast(resized_masks, tf.bfloat16)
     tensor_dict[fields.InputDataFields.
                 groundtruth_instance_masks] = resized_masks
@@ -161,6 +163,9 @@ def transform_input_data(tensor_dict,
     tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes
     tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
         merged_confidences)
+  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
+    tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
+        tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]
   return tensor_dict
@@ -282,12 +287,9 @@ def augment_input_data(tensor_dict, data_augmentation_options):
                                 in tensor_dict)
   include_keypoints = (fields.InputDataFields.groundtruth_keypoints
                        in tensor_dict)
-  include_label_scores = (fields.InputDataFields.groundtruth_confidences in
-                          tensor_dict)
   tensor_dict = preprocessor.preprocess(
       tensor_dict, data_augmentation_options,
       func_arg_map=preprocessor.get_default_func_arg_map(
-          include_label_scores=include_label_scores,
           include_instance_masks=include_instance_masks,
           include_keypoints=include_keypoints))
   tensor_dict[fields.InputDataFields.image] = tf.squeeze(
...
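The transform_input_data hunks above are bookkeeping changes: resized instance masks may be cast to bfloat16, and num_groundtruth_boxes is re-derived from the boxes tensor itself rather than carried over from before merging. A minimal sketch of that recomputation, with placeholder tensors standing in for the real tensor_dict entries (names here are illustrative only):

import tensorflow as tf

# Placeholders standing in for entries of the pipeline's tensor_dict.
groundtruth_boxes = tf.placeholder(tf.float32, shape=[None, 4])
resized_masks = tf.placeholder(tf.float32, shape=[None, 28, 28])
use_bfloat16 = True

# The box count is read from the tensor's leading dimension at graph time,
# so it stays correct after boxes are merged or filtered upstream.
num_groundtruth_boxes = tf.shape(groundtruth_boxes)[0]
if use_bfloat16:
  resized_masks = tf.cast(resized_masks, tf.bfloat16)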
@@ -630,6 +630,9 @@ class DataTransformationFnTest(test_case.TestCase):
     self.assertAllClose(
         transformed_inputs[fields.InputDataFields.groundtruth_confidences],
         [[1, 0, 1]])
+    self.assertAllClose(
+        transformed_inputs[fields.InputDataFields.num_groundtruth_boxes],
+        1)

   def test_returns_resized_masks(self):
     tensor_dict = {
...
@@ -160,6 +160,17 @@ class FakeDetectionModel(model.DetectionModel):
     }
     return loss_dict

+  def regularization_losses(self):
+    """Returns a list of regularization losses for this model.
+
+    Returns a list of regularization losses for this model that the estimator
+    needs to use during training/optimization.
+
+    Returns:
+      A list of regularization loss tensors.
+    """
+    pass
+
   def restore_map(self, fine_tune_checkpoint_type='detection'):
     """Returns a map of variables to load from a foreign checkpoint.
@@ -174,6 +185,18 @@ class FakeDetectionModel(model.DetectionModel):
     """
     return {var.op.name: var for var in tf.global_variables()}

+  def updates(self):
+    """Returns a list of update operators for this model.
+
+    Returns a list of update operators for this model that must be executed at
+    each training step. The estimator's train op needs to have a control
+    dependency on these updates.
+
+    Returns:
+      A list of update operators.
+    """
+    pass
+

 class TrainerTest(tf.test.TestCase):
...
@@ -662,7 +662,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
             anchors_boxlist, clip_window)
       else:
         anchors_boxlist = box_list_ops.clip_to_window(
-            anchors_boxlist, clip_window)
+            anchors_boxlist, clip_window,
+            filter_nonoverlapping=not self._use_static_shapes)
       self._anchors = anchors_boxlist

       prediction_dict = {
@@ -917,12 +918,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
         _, num_classes, mask_height, mask_width = (
             detection_masks.get_shape().as_list())
         _, max_detection = detection_classes.get_shape().as_list()
+        prediction_dict['mask_predictions'] = tf.reshape(
+            detection_masks, [-1, num_classes, mask_height, mask_width])
         if num_classes > 1:
           detection_masks = self._gather_instance_masks(
               detection_masks, detection_classes)

         prediction_dict[fields.DetectionResultFields.detection_masks] = (
-            tf.reshape(detection_masks,
+            tf.reshape(tf.sigmoid(detection_masks),
                        [batch_size, max_detection, mask_height, mask_width]))

     return prediction_dict
@@ -1159,9 +1162,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
       }

     # TODO(jrru): Remove mask_predictions from _post_process_box_classifier.
-    with tf.name_scope('SecondStagePostprocessor'):
-      if (self._number_of_stages == 2 or
-          (self._number_of_stages == 3 and self._is_training)):
+    if (self._number_of_stages == 2 or
+        (self._number_of_stages == 3 and self._is_training)):
+      with tf.name_scope('SecondStagePostprocessor'):
         mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
         detections_dict = self._postprocess_box_classifier(
             prediction_dict['refined_box_encodings'],
@@ -1170,18 +1173,53 @@ class FasterRCNNMetaArch(model.DetectionModel):
             prediction_dict['num_proposals'],
             true_image_shapes,
             mask_predictions=mask_predictions)
-        return detections_dict
+      if 'rpn_features_to_crop' in prediction_dict and self._initial_crop_size:
+        self._add_detection_features_output_node(
+            detections_dict[fields.DetectionResultFields.detection_boxes],
+            prediction_dict['rpn_features_to_crop'])
+      return detections_dict

     if self._number_of_stages == 3:
       # Post processing is already performed in 3rd stage. We need to transfer
       # postprocessed tensors from `prediction_dict` to `detections_dict`.
-      detections_dict = {}
-      for key in prediction_dict:
-        if key == fields.DetectionResultFields.detection_masks:
-          detections_dict[key] = tf.sigmoid(prediction_dict[key])
-        elif 'detection' in key:
-          detections_dict[key] = prediction_dict[key]
-      return detections_dict
+      return prediction_dict
+
+  def _add_detection_features_output_node(self, detection_boxes,
+                                          rpn_features_to_crop):
+    """Add the detection features to the output node.
+
+    The detection features are from cropping rpn_features with boxes.
+    Each bounding box has one feature vector of length depth, which comes from
+    mean_pooling of the cropped rpn_features.
+
+    Args:
+      detection_boxes: a 3-D float32 tensor of shape
+        [batch_size, max_detection, 4] which represents the bounding boxes.
+      rpn_features_to_crop: A 4-D float32 tensor with shape
+        [batch, height, width, depth] representing image features to crop using
+        the proposals boxes.
+    """
+    with tf.name_scope('SecondStageDetectionFeaturesExtract'):
+      flattened_detected_feature_maps = (
+          self._compute_second_stage_input_feature_maps(
+              rpn_features_to_crop, detection_boxes))
+      detection_features_unpooled = (
+          self._feature_extractor.extract_box_classifier_features(
+              flattened_detected_feature_maps,
+              scope=self.second_stage_feature_extractor_scope))
+
+      batch_size = tf.shape(detection_boxes)[0]
+      max_detection = tf.shape(detection_boxes)[1]
+      detection_features_pool = tf.reduce_mean(
+          detection_features_unpooled, axis=[1, 2])
+      detection_features = tf.reshape(
+          detection_features_pool,
+          [batch_size, max_detection, tf.shape(detection_features_pool)[-1]])
+
+      detection_features = tf.identity(
+          detection_features, 'detection_features')

   def _postprocess_rpn(self,
                        rpn_box_encodings_batch,
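The new _add_detection_features_output_node documented above crops the RPN feature map with the final detection boxes, runs the box-classifier feature extractor on the crops, and mean-pools each crop into one depth-length vector per box. A rough, self-contained sketch of the same crop-and-mean-pool idea, using tf.image.crop_and_resize as a stand-in for the model's internal crop op (the function name and crop size below are illustrative, not from this code):

import tensorflow as tf

def pooled_box_features(features, boxes, crop_size=7):
  """Mean-pools a cropped feature patch for every box.

  features: [batch, height, width, depth] float32 feature map.
  boxes: [batch, max_detection, 4] normalized box coordinates.
  """
  batch_size = tf.shape(boxes)[0]
  max_detection = tf.shape(boxes)[1]
  depth = features.get_shape().as_list()[-1]
  flat_boxes = tf.reshape(boxes, [-1, 4])
  # Each flattened box keeps a pointer to the image it came from.
  box_indices = tf.reshape(
      tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, max_detection]),
      [-1])
  crops = tf.image.crop_and_resize(features, flat_boxes, box_indices,
                                   [crop_size, crop_size])
  pooled = tf.reduce_mean(crops, axis=[1, 2])  # one vector per box
  return tf.reshape(pooled, [batch_size, max_detection, depth])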
@@ -1454,6 +1492,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
     # to cls_weights. This could happen as boxes within certain IOU ranges
     # are ignored. If triggered, the selected boxes will still be ignored
     # during loss computation.
+    cls_weights = tf.reduce_mean(cls_weights, axis=-1)
     positive_indicator = tf.greater(tf.argmax(cls_targets, axis=1), 0)
     valid_indicator = tf.logical_and(
         tf.range(proposal_boxlist.num_boxes()) < num_valid_proposals,
@@ -1566,6 +1605,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
       mask_predictions_batch = tf.reshape(
           mask_predictions, [-1, self.max_num_proposals,
                              self.num_classes, mask_height, mask_width])
+
     (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, _,
      num_detections) = self._second_stage_nms_fn(
          refined_decoded_boxes_batch,
@@ -1713,6 +1753,7 @@ class FasterRCNNMetaArch(model.DetectionModel):
         gt_box_batch=groundtruth_boxlists,
         gt_class_targets_batch=(len(groundtruth_boxlists) * [None]),
         gt_weights_batch=groundtruth_weights_list)
+    batch_cls_weights = tf.reduce_mean(batch_cls_weights, axis=2)
     batch_cls_targets = tf.squeeze(batch_cls_targets, axis=2)

     def _minibatch_subsample_fn(inputs):
@@ -1743,7 +1784,8 @@ class FasterRCNNMetaArch(model.DetectionModel):
           losses_mask=losses_mask)
       objectness_losses = self._first_stage_objectness_loss(
           rpn_objectness_predictions_with_background,
-          batch_one_hot_targets, weights=batch_sampled_indices,
+          batch_one_hot_targets,
+          weights=tf.expand_dims(batch_sampled_indices, axis=-1),
           losses_mask=losses_mask)
       localization_loss = tf.reduce_mean(
           tf.reduce_sum(localization_losses, axis=1) / normalizer)
@@ -1960,25 +2002,28 @@ class FasterRCNNMetaArch(model.DetectionModel):
             tf.expand_dims(flat_gt_masks, -1),
             tf.expand_dims(flat_normalized_proposals, axis=1),
             [mask_height, mask_width])
+        # Without stopping gradients into cropped groundtruth masks the
+        # performance with 100-padded groundtruth masks when batch size > 1 is
+        # about 4% worse.
+        # TODO(rathodv): Investigate this since we don't expect any variables
+        # upstream of flat_cropped_gt_mask.
+        flat_cropped_gt_mask = tf.stop_gradient(flat_cropped_gt_mask)
         batch_cropped_gt_mask = tf.reshape(
             flat_cropped_gt_mask,
             [batch_size, -1, mask_height * mask_width])

-        second_stage_mask_losses = ops.reduce_sum_trailing_dimensions(
-            self._second_stage_mask_loss(
-                reshaped_prediction_masks,
-                batch_cropped_gt_mask,
-                weights=batch_mask_target_weights,
-                losses_mask=losses_mask),
-            ndims=2) / (
-                mask_height * mask_width * tf.maximum(
-                    tf.reduce_sum(
-                        batch_mask_target_weights, axis=1, keep_dims=True
-                    ), tf.ones((batch_size, 1))))
-        second_stage_mask_loss = tf.reduce_sum(
-            tf.where(paddings_indicator, second_stage_mask_losses,
-                     tf.zeros_like(second_stage_mask_losses)))
+        mask_losses_weights = (
+            batch_mask_target_weights * tf.to_float(paddings_indicator))
+        mask_losses = self._second_stage_mask_loss(
+            reshaped_prediction_masks,
+            batch_cropped_gt_mask,
+            weights=tf.expand_dims(mask_losses_weights, axis=-1),
+            losses_mask=losses_mask)
+        total_mask_loss = tf.reduce_sum(mask_losses)
+        normalizer = tf.maximum(
+            tf.reduce_sum(mask_losses_weights * mask_height * mask_width), 1.0)
+        second_stage_mask_loss = total_mask_loss / normalizer

       if second_stage_mask_loss is not None:
         mask_loss = tf.multiply(self._second_stage_mask_loss_weight,
@@ -2073,6 +2118,17 @@ class FasterRCNNMetaArch(model.DetectionModel):
         cls_losses=tf.expand_dims(single_image_cls_loss, 0),
         decoded_boxlist_list=[proposal_boxlist])

+  def regularization_losses(self):
+    """Returns a list of regularization losses for this model.
+
+    Returns a list of regularization losses for this model that the estimator
+    needs to use during training/optimization.
+
+    Returns:
+      A list of regularization loss tensors.
+    """
+    return tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+
   def restore_map(self,
                   fine_tune_checkpoint_type='detection',
                   load_all_detection_checkpoint_vars=False):
@@ -2117,3 +2173,16 @@ class FasterRCNNMetaArch(model.DetectionModel):
     feature_extractor_variables = tf.contrib.framework.filter_variables(
         variables_to_restore, include_patterns=include_patterns)
     return {var.op.name: var for var in feature_extractor_variables}
+
+  def updates(self):
+    """Returns a list of update operators for this model.
+
+    Returns a list of update operators for this model that must be executed at
+    each training step. The estimator's train op needs to have a control
+    dependency on these updates.
+
+    Returns:
+      A list of update operators.
+    """
+    return tf.get_collection(tf.GraphKeys.UPDATE_OPS)
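The regularization_losses() and updates() methods added above (and stubbed out on the fake model earlier in this commit) give the estimator-based training loop explicit hooks: regularization terms to fold into the total loss, and update ops (for example batch-norm moving averages) that the train op must depend on. A hedged sketch of how a caller could wire them up; model, total_loss and optimizer are assumed to exist, and this is not the repository's actual training code:

import tensorflow as tf

reg_losses = model.regularization_losses()
if reg_losses:
  total_loss = tf.add_n([total_loss] + list(reg_losses))

# Per the docstring, the train op takes a control dependency on the updates.
with tf.control_dependencies(model.updates()):
  train_op = optimizer.minimize(total_loss)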
@@ -189,7 +189,7 @@ class FasterRCNNMetaArchTest(
         set(expected_shapes.keys()).union(
             set([
                 'detection_boxes', 'detection_scores', 'detection_classes',
-                'detection_masks', 'num_detections'
+                'detection_masks', 'num_detections', 'mask_predictions',
             ])))
     for key in expected_shapes:
       self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
@@ -199,6 +199,9 @@ class FasterRCNNMetaArchTest(
     self.assertAllEqual(tensor_dict_out['detection_classes'].shape, [2, 5])
     self.assertAllEqual(tensor_dict_out['detection_scores'].shape, [2, 5])
     self.assertAllEqual(tensor_dict_out['num_detections'].shape, [2])
+    num_classes = 1 if masks_are_class_agnostic else 2
+    self.assertAllEqual(tensor_dict_out['mask_predictions'].shape,
+                        [10, num_classes, 14, 14])

   @parameterized.parameters(
       {'masks_are_class_agnostic': False},
...
@@ -250,6 +250,7 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
           iou_threshold: 1.0
           max_detections_per_class: 5
           max_total_detections: 5
+          use_static_shapes: """ +'{}'.format(use_static_shapes) + """
         }
     """
     post_processing_config = post_processing_pb2.PostProcessing()
@@ -336,61 +337,71 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
             masks_are_class_agnostic=masks_are_class_agnostic), **common_kwargs)

   def test_predict_gives_correct_shapes_in_inference_mode_first_stage_only(
-      self):
-    test_graph = tf.Graph()
-    with test_graph.as_default():
-      model = self._build_model(
-          is_training=False, number_of_stages=1, second_stage_batch_size=2)
-      batch_size = 2
-      height = 10
-      width = 12
-      input_image_shape = (batch_size, height, width, 3)
-      _, true_image_shapes = model.preprocess(tf.zeros(input_image_shape))
-      preprocessed_inputs = tf.placeholder(
-          dtype=tf.float32, shape=(batch_size, None, None, 3))
+      self, use_static_shapes=False):
+    batch_size = 2
+    height = 10
+    width = 12
+    input_image_shape = (batch_size, height, width, 3)
+
+    def graph_fn(images):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False,
+          number_of_stages=1,
+          second_stage_batch_size=2,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes)
+      preprocessed_inputs, true_image_shapes = model.preprocess(images)
       prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      return (prediction_dict['rpn_box_predictor_features'],
+              prediction_dict['rpn_features_to_crop'],
+              prediction_dict['image_shape'],
+              prediction_dict['rpn_box_encodings'],
+              prediction_dict['rpn_objectness_predictions_with_background'],
+              prediction_dict['anchors'])
+
+    images = np.zeros(input_image_shape, dtype=np.float32)
+
+    # In inference mode, anchors are clipped to the image window, but not
+    # pruned. Since MockFasterRCNN.extract_proposal_features returns a
+    # tensor with the same shape as its input, the expected number of anchors
+    # is height * width * the number of anchors per location (i.e. 3x3).
+    expected_num_anchors = height * width * 3 * 3
+    expected_output_shapes = {
+        'rpn_box_predictor_features': (batch_size, height, width, 512),
+        'rpn_features_to_crop': (batch_size, height, width, 3),
+        'rpn_box_encodings': (batch_size, expected_num_anchors, 4),
+        'rpn_objectness_predictions_with_background':
+        (batch_size, expected_num_anchors, 2),
+        'anchors': (expected_num_anchors, 4)
+    }

-      # In inference mode, anchors are clipped to the image window, but not
-      # pruned. Since MockFasterRCNN.extract_proposal_features returns a
-      # tensor with the same shape as its input, the expected number of anchors
-      # is height * width * the number of anchors per location (i.e. 3x3).
-      expected_num_anchors = height * width * 3 * 3
-      expected_output_keys = set([
-          'rpn_box_predictor_features', 'rpn_features_to_crop', 'image_shape',
-          'rpn_box_encodings', 'rpn_objectness_predictions_with_background',
-          'anchors'])
-      expected_output_shapes = {
-          'rpn_box_predictor_features': (batch_size, height, width, 512),
-          'rpn_features_to_crop': (batch_size, height, width, 3),
-          'rpn_box_encodings': (batch_size, expected_num_anchors, 4),
-          'rpn_objectness_predictions_with_background':
-          (batch_size, expected_num_anchors, 2),
-          'anchors': (expected_num_anchors, 4)
-      }
-      init_op = tf.global_variables_initializer()
-      with self.test_session(graph=test_graph) as sess:
-        sess.run(init_op)
-        prediction_out = sess.run(prediction_dict,
-                                  feed_dict={
-                                      preprocessed_inputs:
-                                      np.zeros(input_image_shape)
-                                  })
-        self.assertEqual(set(prediction_out.keys()), expected_output_keys)
-        self.assertAllEqual(prediction_out['image_shape'], input_image_shape)
-        for output_key, expected_shape in expected_output_shapes.items():
-          self.assertAllEqual(prediction_out[output_key].shape, expected_shape)
+    if use_static_shapes:
+      results = self.execute(graph_fn, [images])
+    else:
+      results = self.execute_cpu(graph_fn, [images])
+
+    self.assertAllEqual(results[0].shape,
+                        expected_output_shapes['rpn_box_predictor_features'])
+    self.assertAllEqual(results[1].shape,
+                        expected_output_shapes['rpn_features_to_crop'])
+    self.assertAllEqual(results[2],
+                        input_image_shape)
+    self.assertAllEqual(results[3].shape,
+                        expected_output_shapes['rpn_box_encodings'])
+    self.assertAllEqual(
+        results[4].shape,
+        expected_output_shapes['rpn_objectness_predictions_with_background'])
+    self.assertAllEqual(results[5].shape,
+                        expected_output_shapes['anchors'])

     # Check that anchors are clipped to window.
-    anchors = prediction_out['anchors']
+    anchors = results[5]
     self.assertTrue(np.all(np.greater_equal(anchors, 0)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 0], height)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 1], width)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 2], height)))
     self.assertTrue(np.all(np.less_equal(anchors[:, 3], width)))

   def test_predict_gives_valid_anchors_in_training_mode_first_stage_only(self):
     test_graph = tf.Graph()
@@ -446,7 +457,38 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         prediction_out['rpn_objectness_predictions_with_background'].shape,
         (batch_size, num_anchors_out, 2))

-  def test_predict_correct_shapes_in_inference_mode_two_stages(self):
+  def test_predict_correct_shapes_in_inference_mode_two_stages(
+      self, use_static_shapes=False):
+
+    def compare_results(results, expected_output_shapes):
+      """Checks if the shape of the predictions are as expected."""
+      self.assertAllEqual(results[0].shape,
+                          expected_output_shapes['rpn_box_predictor_features'])
+      self.assertAllEqual(results[1].shape,
+                          expected_output_shapes['rpn_features_to_crop'])
+      self.assertAllEqual(results[2].shape,
+                          expected_output_shapes['image_shape'])
+      self.assertAllEqual(results[3].shape,
+                          expected_output_shapes['rpn_box_encodings'])
+      self.assertAllEqual(
+          results[4].shape,
+          expected_output_shapes['rpn_objectness_predictions_with_background'])
+      self.assertAllEqual(results[5].shape,
+                          expected_output_shapes['anchors'])
+      self.assertAllEqual(results[6].shape,
+                          expected_output_shapes['refined_box_encodings'])
+      self.assertAllEqual(
+          results[7].shape,
+          expected_output_shapes['class_predictions_with_background'])
+      self.assertAllEqual(results[8].shape,
+                          expected_output_shapes['num_proposals'])
+      self.assertAllEqual(results[9].shape,
+                          expected_output_shapes['proposal_boxes'])
+      self.assertAllEqual(results[10].shape,
+                          expected_output_shapes['proposal_boxes_normalized'])
+      self.assertAllEqual(results[11].shape,
+                          expected_output_shapes['box_classifier_features'])
+
     batch_size = 2
     image_size = 10
     max_num_proposals = 8
@@ -457,6 +499,32 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
                     (None, image_size, image_size, 3),
                     (batch_size, None, None, 3),
                     (None, None, None, 3)]
+
+    def graph_fn_tpu(images):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False,
+          number_of_stages=2,
+          second_stage_batch_size=2,
+          predict_masks=False,
+          use_matmul_crop_and_resize=use_static_shapes,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes)
+      preprocessed_inputs, true_image_shapes = model.preprocess(images)
+      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      return (prediction_dict['rpn_box_predictor_features'],
+              prediction_dict['rpn_features_to_crop'],
+              prediction_dict['image_shape'],
+              prediction_dict['rpn_box_encodings'],
+              prediction_dict['rpn_objectness_predictions_with_background'],
+              prediction_dict['anchors'],
+              prediction_dict['refined_box_encodings'],
+              prediction_dict['class_predictions_with_background'],
+              prediction_dict['num_proposals'],
+              prediction_dict['proposal_boxes'],
+              prediction_dict['proposal_boxes_normalized'],
+              prediction_dict['box_classifier_features'])
+
     expected_num_anchors = image_size * image_size * 3 * 3
     expected_shapes = {
         'rpn_box_predictor_features':
@@ -481,28 +549,34 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
             3)
     }

-    for input_shape in input_shapes:
-      test_graph = tf.Graph()
-      with test_graph.as_default():
-        model = self._build_model(
-            is_training=False,
-            number_of_stages=2,
-            second_stage_batch_size=2,
-            predict_masks=False)
-        preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape)
-        _, true_image_shapes = model.preprocess(preprocessed_inputs)
-        result_tensor_dict = model.predict(
-            preprocessed_inputs, true_image_shapes)
-        init_op = tf.global_variables_initializer()
-      with self.test_session(graph=test_graph) as sess:
-        sess.run(init_op)
-        tensor_dict_out = sess.run(result_tensor_dict, feed_dict={
-            preprocessed_inputs:
-            np.zeros((batch_size, image_size, image_size, 3))})
-        self.assertEqual(set(tensor_dict_out.keys()),
-                         set(expected_shapes.keys()))
-        for key in expected_shapes:
-          self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])
+    if use_static_shapes:
+      input_shape = (batch_size, image_size, image_size, 3)
+      images = np.zeros(input_shape, dtype=np.float32)
+      results = self.execute(graph_fn_tpu, [images])
+      compare_results(results, expected_shapes)
+    else:
+      for input_shape in input_shapes:
+        test_graph = tf.Graph()
+        with test_graph.as_default():
+          model = self._build_model(
+              is_training=False,
+              number_of_stages=2,
+              second_stage_batch_size=2,
+              predict_masks=False)
+          preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape)
+          _, true_image_shapes = model.preprocess(preprocessed_inputs)
+          result_tensor_dict = model.predict(
+              preprocessed_inputs, true_image_shapes)
+          init_op = tf.global_variables_initializer()
+        with self.test_session(graph=test_graph) as sess:
+          sess.run(init_op)
+          tensor_dict_out = sess.run(result_tensor_dict, feed_dict={
+              preprocessed_inputs:
+              np.zeros((batch_size, image_size, image_size, 3))})
+          self.assertEqual(set(tensor_dict_out.keys()),
+                           set(expected_shapes.keys()))
+          for key in expected_shapes:
+            self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key])

   def test_predict_gives_correct_shapes_in_train_mode_both_stages(
       self,
@@ -596,23 +670,46 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
     self.assertAllEqual(results[8].shape,
                         expected_shapes['rpn_box_predictor_features'])

-  def _test_postprocess_first_stage_only_inference_mode(
-      self, pad_to_max_dimension=None):
-    model = self._build_model(
-        is_training=False, number_of_stages=1, second_stage_batch_size=6,
-        pad_to_max_dimension=pad_to_max_dimension)
+  def test_postprocess_first_stage_only_inference_mode(
+      self, use_static_shapes=False, pad_to_max_dimension=None):
     batch_size = 2
-    anchors = tf.constant(
+    first_stage_max_proposals = 4 if use_static_shapes else 8
+
+    def graph_fn(images,
+                 rpn_box_encodings,
+                 rpn_objectness_predictions_with_background,
+                 rpn_features_to_crop,
+                 anchors):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False, number_of_stages=1, second_stage_batch_size=6,
+          use_matmul_crop_and_resize=use_static_shapes,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes,
+          use_matmul_gather_in_matcher=use_static_shapes,
+          first_stage_max_proposals=first_stage_max_proposals,
+          pad_to_max_dimension=pad_to_max_dimension)
+      _, true_image_shapes = model.preprocess(images)
+      proposals = model.postprocess({
+          'rpn_box_encodings': rpn_box_encodings,
+          'rpn_objectness_predictions_with_background':
+          rpn_objectness_predictions_with_background,
+          'rpn_features_to_crop': rpn_features_to_crop,
+          'anchors': anchors}, true_image_shapes)
+      return (proposals['num_detections'],
+              proposals['detection_boxes'],
+              proposals['detection_scores'])
+
+    anchors = np.array(
         [[0, 0, 16, 16],
          [0, 16, 16, 32],
          [16, 0, 32, 16],
-         [16, 16, 32, 32]], dtype=tf.float32)
-    rpn_box_encodings = tf.zeros(
-        [batch_size, anchors.get_shape().as_list()[0],
-         BOX_CODE_SIZE], dtype=tf.float32)
+         [16, 16, 32, 32]], dtype=np.float32)
+    rpn_box_encodings = np.zeros(
+        (batch_size, anchors.shape[0], BOX_CODE_SIZE), dtype=np.float32)
     # use different numbers for the objectness category to break ties in
     # order of boxes returned by NMS
-    rpn_objectness_predictions_with_background = tf.constant([
+    rpn_objectness_predictions_with_background = np.array([
         [[-10, 13],
          [10, -10],
          [10, -11],
@@ -620,16 +717,22 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         [[10, -10],
          [-10, 13],
          [-10, 12],
-         [10, -11]]], dtype=tf.float32)
-    rpn_features_to_crop = tf.ones((batch_size, 8, 8, 10), dtype=tf.float32)
-    image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)
-    _, true_image_shapes = model.preprocess(tf.zeros(image_shape))
-    proposals = model.postprocess({
-        'rpn_box_encodings': rpn_box_encodings,
-        'rpn_objectness_predictions_with_background':
-        rpn_objectness_predictions_with_background,
-        'rpn_features_to_crop': rpn_features_to_crop,
-        'anchors': anchors}, true_image_shapes)
+         [10, -11]]], dtype=np.float32)
+    rpn_features_to_crop = np.ones((batch_size, 8, 8, 10), dtype=np.float32)
+    image_shape = (batch_size, 32, 32, 3)
+    images = np.zeros(image_shape, dtype=np.float32)
+
+    if use_static_shapes:
+      results = self.execute(graph_fn,
+                             [images, rpn_box_encodings,
+                              rpn_objectness_predictions_with_background,
+                              rpn_features_to_crop, anchors])
+    else:
+      results = self.execute_cpu(graph_fn,
+                                 [images, rpn_box_encodings,
+                                  rpn_objectness_predictions_with_background,
+                                  rpn_features_to_crop, anchors])
     expected_proposal_boxes = [
         [[0, 0, .5, .5], [.5, .5, 1, 1], [0, .5, .5, 1], [.5, 0, 1.0, .5]]
         + 4 * [4 * [0]],
@@ -639,24 +742,12 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
         [1, 1, 0, 0, 0, 0, 0, 0]]
     expected_num_proposals = [4, 4]

-    expected_output_keys = set(['detection_boxes', 'detection_scores',
-                                'num_detections'])
-    self.assertEqual(set(proposals.keys()), expected_output_keys)
-    with self.test_session() as sess:
-      proposals_out = sess.run(proposals)
-      self.assertAllClose(proposals_out['detection_boxes'],
-                          expected_proposal_boxes)
-      self.assertAllClose(proposals_out['detection_scores'],
-                          expected_proposal_scores)
-      self.assertAllEqual(proposals_out['num_detections'],
-                          expected_num_proposals)
-
-  def test_postprocess_first_stage_only_inference_mode(self):
-    self._test_postprocess_first_stage_only_inference_mode()
-
-  def test_postprocess_first_stage_only_inference_mode_padded_image(self):
-    self._test_postprocess_first_stage_only_inference_mode(
-        pad_to_max_dimension=56)
+    self.assertAllClose(results[0], expected_num_proposals)
+    for indx, num_proposals in enumerate(expected_num_proposals):
+      self.assertAllClose(results[1][indx][0:num_proposals],
+                          expected_proposal_boxes[indx][0:num_proposals])
+      self.assertAllClose(results[2][indx][0:num_proposals],
+                          expected_proposal_scores[indx][0:num_proposals])

   def _test_postprocess_first_stage_only_train_mode(self,
                                                     pad_to_max_dimension=None):
@@ -733,83 +824,80 @@ class FasterRCNNMetaArchTestBase(test_case.TestCase, parameterized.TestCase):
   def test_postprocess_first_stage_only_train_mode_padded_image(self):
     self._test_postprocess_first_stage_only_train_mode(pad_to_max_dimension=56)

-  def _test_postprocess_second_stage_only_inference_mode(
-      self, pad_to_max_dimension=None):
-    num_proposals_shapes = [(2), (None,)]
-    refined_box_encodings_shapes = [(16, 2, 4), (None, 2, 4)]
-    class_predictions_with_background_shapes = [(16, 3), (None, 3)]
-    proposal_boxes_shapes = [(2, 8, 4), (None, 8, 4)]
+  def test_postprocess_second_stage_only_inference_mode(
+      self, use_static_shapes=False, pad_to_max_dimension=None):
     batch_size = 2
+    num_classes = 2
     image_shape = np.array((2, 36, 48, 3), dtype=np.int32)
-    for (num_proposals_shape, refined_box_encoding_shape,
-         class_predictions_with_background_shape,
-         proposal_boxes_shape) in zip(num_proposals_shapes,
-                                      refined_box_encodings_shapes,
-                                      class_predictions_with_background_shapes,
-                                      proposal_boxes_shapes):
-      tf_graph = tf.Graph()
-      with tf_graph.as_default():
-        model = self._build_model(
-            is_training=False, number_of_stages=2,
-            second_stage_batch_size=6,
-            pad_to_max_dimension=pad_to_max_dimension)
-        _, true_image_shapes = model.preprocess(tf.zeros(image_shape))
-        total_num_padded_proposals = batch_size * model.max_num_proposals
-        proposal_boxes = np.array(
-            [[[1, 1, 2, 3],
-              [0, 0, 1, 1],
-              [.5, .5, .6, .6],
-              4*[0], 4*[0], 4*[0], 4*[0], 4*[0]],
-             [[2, 3, 6, 8],
-              [1, 2, 5, 3],
-              4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]])
-        num_proposals = np.array([3, 2], dtype=np.int32)
-        refined_box_encodings = np.zeros(
-            [total_num_padded_proposals, model.num_classes, 4])
-        class_predictions_with_background = np.ones(
-            [total_num_padded_proposals, model.num_classes+1])
-
-        num_proposals_placeholder = tf.placeholder(tf.int32,
-                                                   shape=num_proposals_shape)
-        refined_box_encodings_placeholder = tf.placeholder(
-            tf.float32, shape=refined_box_encoding_shape)
-        class_predictions_with_background_placeholder = tf.placeholder(
-            tf.float32, shape=class_predictions_with_background_shape)
-        proposal_boxes_placeholder = tf.placeholder(
-            tf.float32, shape=proposal_boxes_shape)
-        image_shape_placeholder = tf.placeholder(tf.int32, shape=(4))
-
-        detections = model.postprocess({
-            'refined_box_encodings': refined_box_encodings_placeholder,
-            'class_predictions_with_background':
-            class_predictions_with_background_placeholder,
-            'num_proposals': num_proposals_placeholder,
-            'proposal_boxes': proposal_boxes_placeholder,
-        }, true_image_shapes)
-      with self.test_session(graph=tf_graph) as sess:
-        detections_out = sess.run(
-            detections,
-            feed_dict={
-                refined_box_encodings_placeholder: refined_box_encodings,
-                class_predictions_with_background_placeholder:
-                class_predictions_with_background,
-                num_proposals_placeholder: num_proposals,
-                proposal_boxes_placeholder: proposal_boxes,
-                image_shape_placeholder: image_shape
-            })
-        self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4])
-        self.assertAllClose(detections_out['detection_scores'],
-                            [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])
-        self.assertAllClose(detections_out['detection_classes'],
-                            [[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]])
-        self.assertAllClose(detections_out['num_detections'], [5, 4])
-
-  def test_postprocess_second_stage_only_inference_mode(self):
-    self._test_postprocess_second_stage_only_inference_mode()
-
-  def test_postprocess_second_stage_only_inference_mode_padded_image(self):
-    self._test_postprocess_second_stage_only_inference_mode(
-        pad_to_max_dimension=56)
+    first_stage_max_proposals = 8
+    total_num_padded_proposals = batch_size * first_stage_max_proposals
+
+    def graph_fn(images,
+                 refined_box_encodings,
+                 class_predictions_with_background,
+                 num_proposals,
+                 proposal_boxes):
+      """Function to construct tf graph for the test."""
+      model = self._build_model(
+          is_training=False, number_of_stages=2,
+          second_stage_batch_size=6,
+          use_matmul_crop_and_resize=use_static_shapes,
+          clip_anchors_to_image=use_static_shapes,
+          use_static_shapes=use_static_shapes,
+          use_matmul_gather_in_matcher=use_static_shapes,
+          pad_to_max_dimension=pad_to_max_dimension)
+      _, true_image_shapes = model.preprocess(images)
+      detections = model.postprocess({
+          'refined_box_encodings': refined_box_encodings,
+          'class_predictions_with_background':
+          class_predictions_with_background,
+          'num_proposals': num_proposals,
+          'proposal_boxes': proposal_boxes,
+      }, true_image_shapes)
+      return (detections['num_detections'],
+              detections['detection_boxes'],
+              detections['detection_scores'],
+              detections['detection_classes'])
+
+    proposal_boxes = np.array(
+        [[[1, 1, 2, 3],
+          [0, 0, 1, 1],
+          [.5, .5, .6, .6],
+          4*[0], 4*[0], 4*[0], 4*[0], 4*[0]],
+         [[2, 3, 6, 8],
+          [1, 2, 5, 3],
+          4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]], dtype=np.float32)
+    num_proposals = np.array([3, 2], dtype=np.int32)
+    refined_box_encodings = np.zeros(
+        [total_num_padded_proposals, num_classes, 4], dtype=np.float32)
+    class_predictions_with_background = np.ones(
+        [total_num_padded_proposals, num_classes+1], dtype=np.float32)
+    images = np.zeros(image_shape, dtype=np.float32)
+
+    if use_static_shapes:
+      results = self.execute(graph_fn,
+                             [images, refined_box_encodings,
+                              class_predictions_with_background,
+                              num_proposals, proposal_boxes])
+    else:
+      results = self.execute_cpu(graph_fn,
+                                 [images, refined_box_encodings,
+                                  class_predictions_with_background,
+                                  num_proposals, proposal_boxes])
+    expected_num_detections = [5, 4]
+    expected_detection_classes = [[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]]
+    expected_detection_scores = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]
+
+    self.assertAllClose(results[0], expected_num_detections)
+
+    for indx, num_proposals in enumerate(expected_num_detections):
+      self.assertAllClose(results[2][indx][0:num_proposals],
+                          expected_detection_scores[indx][0:num_proposals])
+      self.assertAllClose(results[3][indx][0:num_proposals],
+                          expected_detection_classes[indx][0:num_proposals])
+
+    if not use_static_shapes:
+      self.assertAllEqual(results[1].shape, [2, 5, 4])

   def test_preprocess_preserves_input_shapes(self):
     image_shapes = [(3, None, None, 3),
...
@@ -19,7 +19,6 @@ models.
 """
 from abc import abstractmethod

-import re
 import tensorflow as tf

 from object_detection.core import box_list
@@ -116,6 +115,25 @@ class SSDFeatureExtractor(object):
     """
     raise NotImplementedError

+  def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
+    """Returns a map of variables to load from a foreign checkpoint.
+
+    Args:
+      feature_extractor_scope: A scope name for the feature extractor.
+
+    Returns:
+      A dict mapping variable names (to load from a checkpoint) to variables in
+      the model graph.
+    """
+    variables_to_restore = {}
+    for variable in tf.global_variables():
+      var_name = variable.op.name
+      if var_name.startswith(feature_extractor_scope + '/'):
+        var_name = var_name.replace(feature_extractor_scope + '/', '')
+        variables_to_restore[var_name] = variable
+    return variables_to_restore
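The helper above (added identically to the Keras feature extractor in the next hunk) builds a name-to-variable map with the feature extractor scope stripped, so variables line up with the names in a plain classification checkpoint. A hedged usage sketch; the feature_extractor instance, scope name and checkpoint path are assumed:

import tensorflow as tf

var_map = feature_extractor.restore_from_classification_checkpoint_fn(
    'FeatureExtractor')
saver = tf.train.Saver(var_list=var_map)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  saver.restore(sess, '/path/to/classification/model.ckpt')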

 class SSDKerasFeatureExtractor(tf.keras.Model):
   """SSD Feature Extractor definition."""
@@ -218,6 +236,25 @@ class SSDKerasFeatureExtractor(tf.keras.Model):
   def call(self, inputs, **kwargs):
     return self._extract_features(inputs)

+  def restore_from_classification_checkpoint_fn(self, feature_extractor_scope):
+    """Returns a map of variables to load from a foreign checkpoint.
+
+    Args:
+      feature_extractor_scope: A scope name for the feature extractor.
+
+    Returns:
+      A dict mapping variable names (to load from a checkpoint) to variables in
+      the model graph.
+    """
+    variables_to_restore = {}
+    for variable in tf.global_variables():
+      var_name = variable.op.name
+      if var_name.startswith(feature_extractor_scope + '/'):
+        var_name = var_name.replace(feature_extractor_scope + '/', '')
+        variables_to_restore[var_name] = variable
+    return variables_to_restore
+

 class SSDMetaArch(model.DetectionModel):
   """SSD Meta-architecture definition."""
@@ -333,13 +370,15 @@ class SSDMetaArch(model.DetectionModel):
       # Slim feature extractors get an explicit naming scope
       self._extract_features_scope = 'FeatureExtractor'

-    # TODO(jonathanhuang): handle agnostic mode
-    # weights
-    self._unmatched_class_label = tf.constant([1] + self.num_classes * [0],
-                                              tf.float32)
-    if encode_background_as_zeros:
+    if self._add_background_class and encode_background_as_zeros:
       self._unmatched_class_label = tf.constant((self.num_classes + 1) * [0],
                                                 tf.float32)
+    elif self._add_background_class:
+      self._unmatched_class_label = tf.constant([1] + self.num_classes * [0],
+                                                tf.float32)
+    else:
+      self._unmatched_class_label = tf.constant(self.num_classes * [0],
+                                                tf.float32)

     self._target_assigner = target_assigner_instance
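For reference, the three branches above select the following unmatched-class labels; a small illustration with a hypothetical num_classes of 3:

import tensorflow as tf

num_classes = 3  # hypothetical value, for illustration only
# encode_background_as_zeros: background is the all-zeros row.
bg_as_zeros = tf.constant((num_classes + 1) * [0], tf.float32)   # [0, 0, 0, 0]
# explicit background class: one-hot on the background column.
explicit_bg = tf.constant([1] + num_classes * [0], tf.float32)   # [1, 0, 0, 0]
# no background class at all.
no_bg = tf.constant(num_classes * [0], tf.float32)               # [0, 0, 0]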
@@ -606,14 +645,22 @@ class SSDMetaArch(model.DetectionModel):
       detection_boxes = tf.identity(detection_boxes, 'raw_box_locations')
       detection_boxes = tf.expand_dims(detection_boxes, axis=2)

-      detection_scores_with_background = self._score_conversion_fn(
-          class_predictions)
-      detection_scores_with_background = tf.identity(
-          detection_scores_with_background, 'raw_box_scores')
-      detection_scores = tf.slice(detection_scores_with_background, [0, 0, 1],
-                                  [-1, -1, -1])
+      detection_scores = self._score_conversion_fn(class_predictions)
+      detection_scores = tf.identity(detection_scores, 'raw_box_scores')
+      if self._add_background_class:
+        detection_scores = tf.slice(detection_scores, [0, 0, 1], [-1, -1, -1])

       additional_fields = None

+      batch_size = (
+          shape_utils.combined_static_and_dynamic_shape(preprocessed_images)[0])
+
+      if 'feature_maps' in prediction_dict:
+        feature_map_list = []
+        for feature_map in prediction_dict['feature_maps']:
+          feature_map_list.append(tf.reshape(feature_map, [batch_size, -1]))
+        box_features = tf.concat(feature_map_list, 1)
+        box_features = tf.identity(box_features, 'raw_box_features')
+
       if detection_keypoints is not None:
         additional_fields = {
             fields.BoxListFields.keypoints: detection_keypoints}
@@ -683,17 +730,20 @@ class SSDMetaArch(model.DetectionModel):
           self.groundtruth_lists(fields.BoxListFields.boxes), match_list)

       if self._random_example_sampler:
+        batch_cls_per_anchor_weights = tf.reduce_mean(
+            batch_cls_weights, axis=-1)
         batch_sampled_indicator = tf.to_float(
             shape_utils.static_or_dynamic_map_fn(
                 self._minibatch_subsample_fn,
-                [batch_cls_targets, batch_cls_weights],
+                [batch_cls_targets, batch_cls_per_anchor_weights],
                 dtype=tf.bool,
                 parallel_iterations=self._parallel_iterations,
                 back_prop=True))
         batch_reg_weights = tf.multiply(batch_sampled_indicator,
                                         batch_reg_weights)
-        batch_cls_weights = tf.multiply(batch_sampled_indicator,
-                                        batch_cls_weights)
+        batch_cls_weights = tf.multiply(
+            tf.expand_dims(batch_sampled_indicator, -1),
+            batch_cls_weights)

       losses_mask = None
       if self.groundtruth_has_field(fields.InputDataFields.is_annotated):
@@ -713,16 +763,32 @@ class SSDMetaArch(model.DetectionModel):
             losses_mask=losses_mask)

       if self._expected_classification_loss_under_sampling:
+        # Need to compute losses for assigned targets against the
+        # unmatched_class_label as well as their assigned targets.
+        # simplest thing (but wasteful) is just to calculate all losses
+        # twice
+        batch_size, num_anchors, num_classes = batch_cls_targets.get_shape()
+        unmatched_targets = tf.ones([batch_size, num_anchors, 1
+                                    ]) * self._unmatched_class_label
+
+        unmatched_cls_losses = self._classification_loss(
+            prediction_dict['class_predictions_with_background'],
+            unmatched_targets,
+            weights=batch_cls_weights,
+            losses_mask=losses_mask)
+
         if cls_losses.get_shape().ndims == 3:
           batch_size, num_anchors, num_classes = cls_losses.get_shape()
           cls_losses = tf.reshape(cls_losses, [batch_size, -1])
+          unmatched_cls_losses = tf.reshape(unmatched_cls_losses,
+                                            [batch_size, -1])
           batch_cls_targets = tf.reshape(
               batch_cls_targets, [batch_size, num_anchors * num_classes, -1])
           batch_cls_targets = tf.concat(
               [1 - batch_cls_targets, batch_cls_targets], axis=-1)

         cls_losses = self._expected_classification_loss_under_sampling(
-            batch_cls_targets, cls_losses)
+            batch_cls_targets, cls_losses, unmatched_cls_losses)

       classification_loss = tf.reduce_sum(cls_losses)
       localization_loss = tf.reduce_sum(location_losses)
@@ -971,6 +1037,26 @@ class SSDMetaArch(model.DetectionModel):
             [combined_shape[0], combined_shape[1], 4]))
     return decoded_boxes, decoded_keypoints

+  def regularization_losses(self):
+    """Returns a list of regularization losses for this model.
+
+    Returns a list of regularization losses for this model that the estimator
+    needs to use during training/optimization.
+
+    Returns:
+      A list of regularization loss tensors.
+    """
+    losses = []
+    slim_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+    # Copy the slim losses to avoid modifying the collection
+    if slim_losses:
+      losses.extend(slim_losses)
+    if self._box_predictor.is_keras_model:
+      losses.extend(self._box_predictor.losses)
+    if self._feature_extractor.is_keras_model:
+      losses.extend(self._feature_extractor.losses)
+    return losses
+
   def restore_map(self,
                   fine_tune_checkpoint_type='detection',
                   load_all_detection_checkpoint_vars=False):
@@ -997,18 +1083,44 @@ class SSDMetaArch(model.DetectionModel):
     if fine_tune_checkpoint_type not in ['detection', 'classification']:
       raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
           fine_tune_checkpoint_type))
-    variables_to_restore = {}
-    for variable in tf.global_variables():
-      var_name = variable.op.name
-      if (fine_tune_checkpoint_type == 'detection' and
-          load_all_detection_checkpoint_vars):
-        variables_to_restore[var_name] = variable
-      else:
-        if var_name.startswith(self._extract_features_scope):
-          if fine_tune_checkpoint_type == 'classification':
-            var_name = (
-                re.split('^' + self._extract_features_scope + '/',
-                         var_name)[-1])
-          variables_to_restore[var_name] = variable
-    return variables_to_restore
+
+    if fine_tune_checkpoint_type == 'classification':
+      return self._feature_extractor.restore_from_classification_checkpoint_fn(
+          self._extract_features_scope)
+
+    if fine_tune_checkpoint_type == 'detection':
+      variables_to_restore = {}
+      for variable in tf.global_variables():
+        var_name = variable.op.name
+        if load_all_detection_checkpoint_vars:
+          variables_to_restore[var_name] = variable
+        else:
+          if var_name.startswith(self._extract_features_scope):
+            variables_to_restore[var_name] = variable
+      return variables_to_restore
+
+  def updates(self):
+    """Returns a list of update operators for this model.
+
+    Returns a list of update operators for this model that must be executed at
+    each training step. The estimator's train op needs to have a control
+    dependency on these updates.
+
+    Returns:
+      A list of update operators.
+    """
+    update_ops = []
+    slim_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+    # Copy the slim ops to avoid modifying the collection
+    if slim_update_ops:
+      update_ops.extend(slim_update_ops)
+    if self._box_predictor.is_keras_model:
+      update_ops.extend(self._box_predictor.get_updates_for(None))
+      update_ops.extend(self._box_predictor.get_updates_for(
+          self._box_predictor.inputs))
+    if self._feature_extractor.is_keras_model:
+      update_ops.extend(self._feature_extractor.get_updates_for(None))
+      update_ops.extend(self._feature_extractor.get_updates_for(
+          self._feature_extractor.inputs))
+    return update_ops
@@ -42,7 +42,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
       random_example_sampling=False,
       weight_regression_loss_by_score=False,
       use_expected_classification_loss_under_sampling=False,
-      minimum_negative_sampling=1,
+      min_num_negative_samples=1,
       desired_negative_sampling_ratio=3,
       use_keras=False,
       predict_mask=False,
@@ -57,7 +57,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
         weight_regression_loss_by_score=weight_regression_loss_by_score,
         use_expected_classification_loss_under_sampling=
         use_expected_classification_loss_under_sampling,
-        minimum_negative_sampling=minimum_negative_sampling,
+        min_num_negative_samples=min_num_negative_samples,
         desired_negative_sampling_ratio=desired_negative_sampling_ratio,
         use_keras=use_keras,
         predict_mask=predict_mask,
@@ -344,11 +344,11 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
     preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
     groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32)
     groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32)
-    groundtruth_classes1 = np.array([[0, 1]], dtype=np.float32)
-    groundtruth_classes2 = np.array([[0, 1]], dtype=np.float32)
+    groundtruth_classes1 = np.array([[1]], dtype=np.float32)
+    groundtruth_classes2 = np.array([[1]], dtype=np.float32)
     expected_localization_loss = 0.0
     expected_classification_loss = (
-        batch_size * num_anchors * (num_classes + 1) * np.log(2.0))
+        batch_size * num_anchors * num_classes * np.log(2.0))
     (localization_loss, classification_loss) = self.execute(
         graph_fn, [
             preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
@@ -371,7 +371,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
         apply_hard_mining=False,
         add_background_class=True,
         use_expected_classification_loss_under_sampling=True,
-        minimum_negative_sampling=1,
+        min_num_negative_samples=1,
         desired_negative_sampling_ratio=desired_negative_sampling_ratio)
     model.provide_groundtruth(groundtruth_boxes_list,
                               groundtruth_classes_list)
@@ -391,8 +391,7 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
     expected_localization_loss = 0.0
     expected_classification_loss = (
-        batch_size * (desired_negative_sampling_ratio * num_anchors +
-                      num_classes * num_anchors) * np.log(2.0))
+        batch_size * (num_anchors + num_classes * num_anchors) * np.log(2.0))
     (localization_loss, classification_loss) = self.execute(
         graph_fn, [
             preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
...@@ -432,11 +431,11 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase, ...@@ -432,11 +431,11 @@ class SsdMetaArchTest(ssd_meta_arch_test_lib.SSDMetaArchTestBase,
preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32)
groundtruth_boxes1 = np.array([[0, 0, 1, 1]], dtype=np.float32) groundtruth_boxes1 = np.array([[0, 0, 1, 1]], dtype=np.float32)
groundtruth_boxes2 = np.array([[0, 0, 1, 1]], dtype=np.float32) groundtruth_boxes2 = np.array([[0, 0, 1, 1]], dtype=np.float32)
groundtruth_classes1 = np.array([[0, 1]], dtype=np.float32) groundtruth_classes1 = np.array([[1]], dtype=np.float32)
groundtruth_classes2 = np.array([[1, 0]], dtype=np.float32) groundtruth_classes2 = np.array([[0]], dtype=np.float32)
expected_localization_loss = 0.25 expected_localization_loss = 0.25
expected_classification_loss = ( expected_classification_loss = (
batch_size * num_anchors * (num_classes + 1) * np.log(2.0)) batch_size * num_anchors * num_classes * np.log(2.0))
(localization_loss, classification_loss) = self.execute( (localization_loss, classification_loss) = self.execute(
graph_fn, [ graph_fn, [
preprocessed_input, groundtruth_boxes1, groundtruth_boxes2, preprocessed_input, groundtruth_boxes1, groundtruth_boxes2,
......
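Note on the expected-loss constants in the hunks above: they assume the mock box predictor emits all-zero logits, so every per-anchor, per-class sigmoid cross-entropy term equals log 2. A minimal numeric sanity check of that arithmetic follows; the 2/4/1 values are illustrative placeholders, not the test's actual configuration.

import numpy as np

batch_size, num_anchors, num_classes = 2, 4, 1  # illustrative only
per_term = -np.log(1.0 / (1.0 + np.exp(0.0)))   # sigmoid cross-entropy at logit 0 == log(2)
expected_classification_loss = batch_size * num_anchors * num_classes * per_term
assert np.isclose(expected_classification_loss,
                  batch_size * num_anchors * num_classes * np.log(2.0))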
...@@ -119,7 +119,7 @@ class SSDMetaArchTestBase(test_case.TestCase): ...@@ -119,7 +119,7 @@ class SSDMetaArchTestBase(test_case.TestCase):
random_example_sampling=False, random_example_sampling=False,
weight_regression_loss_by_score=False, weight_regression_loss_by_score=False,
use_expected_classification_loss_under_sampling=False, use_expected_classification_loss_under_sampling=False,
minimum_negative_sampling=1, min_num_negative_samples=1,
desired_negative_sampling_ratio=3, desired_negative_sampling_ratio=3,
use_keras=False, use_keras=False,
predict_mask=False, predict_mask=False,
...@@ -130,10 +130,12 @@ class SSDMetaArchTestBase(test_case.TestCase): ...@@ -130,10 +130,12 @@ class SSDMetaArchTestBase(test_case.TestCase):
mock_anchor_generator = MockAnchorGenerator2x2() mock_anchor_generator = MockAnchorGenerator2x2()
if use_keras: if use_keras:
mock_box_predictor = test_utils.MockKerasBoxPredictor( mock_box_predictor = test_utils.MockKerasBoxPredictor(
is_training, num_classes, predict_mask=predict_mask) is_training, num_classes, add_background_class=add_background_class,
predict_mask=predict_mask)
else: else:
mock_box_predictor = test_utils.MockBoxPredictor( mock_box_predictor = test_utils.MockBoxPredictor(
is_training, num_classes, predict_mask=predict_mask) is_training, num_classes, add_background_class=add_background_class,
predict_mask=predict_mask)
mock_box_coder = test_utils.MockBoxCoder() mock_box_coder = test_utils.MockBoxCoder()
if use_keras: if use_keras:
fake_feature_extractor = FakeSSDKerasFeatureExtractor() fake_feature_extractor = FakeSSDKerasFeatureExtractor()
...@@ -182,7 +184,7 @@ class SSDMetaArchTestBase(test_case.TestCase): ...@@ -182,7 +184,7 @@ class SSDMetaArchTestBase(test_case.TestCase):
if use_expected_classification_loss_under_sampling: if use_expected_classification_loss_under_sampling:
expected_classification_loss_under_sampling = functools.partial( expected_classification_loss_under_sampling = functools.partial(
ops.expected_classification_loss_under_sampling, ops.expected_classification_loss_under_sampling,
minimum_negative_sampling=minimum_negative_sampling, min_num_negative_samples=min_num_negative_samples,
desired_negative_sampling_ratio=desired_negative_sampling_ratio) desired_negative_sampling_ratio=desired_negative_sampling_ratio)
code_size = 4 code_size = 4
......
...@@ -248,27 +248,30 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -248,27 +248,30 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_boxes_batched, detection_boxes_batched,
detection_scores_batched, detection_scores_batched,
detection_classes_batched, detection_classes_batched,
num_det_boxes_per_image): num_det_boxes_per_image,
is_annotated_batched):
"""Update operation for adding batch of images to Coco evaluator.""" """Update operation for adding batch of images to Coco evaluator."""
for (image_id, gt_box, gt_class, gt_is_crowd, num_gt_box, det_box, for (image_id, gt_box, gt_class, gt_is_crowd, num_gt_box, det_box,
det_score, det_class, num_det_box) in zip( det_score, det_class, num_det_box, is_annotated) in zip(
image_id_batched, groundtruth_boxes_batched, image_id_batched, groundtruth_boxes_batched,
groundtruth_classes_batched, groundtruth_is_crowd_batched, groundtruth_classes_batched, groundtruth_is_crowd_batched,
num_gt_boxes_per_image, num_gt_boxes_per_image,
detection_boxes_batched, detection_scores_batched, detection_boxes_batched, detection_scores_batched,
detection_classes_batched, num_det_boxes_per_image): detection_classes_batched, num_det_boxes_per_image,
self.add_single_ground_truth_image_info( is_annotated_batched):
image_id, { if is_annotated:
'groundtruth_boxes': gt_box[:num_gt_box], self.add_single_ground_truth_image_info(
'groundtruth_classes': gt_class[:num_gt_box], image_id, {
'groundtruth_is_crowd': gt_is_crowd[:num_gt_box] 'groundtruth_boxes': gt_box[:num_gt_box],
}) 'groundtruth_classes': gt_class[:num_gt_box],
self.add_single_detected_image_info( 'groundtruth_is_crowd': gt_is_crowd[:num_gt_box]
image_id, })
{'detection_boxes': det_box[:num_det_box], self.add_single_detected_image_info(
'detection_scores': det_score[:num_det_box], image_id,
'detection_classes': det_class[:num_det_box]}) {'detection_boxes': det_box[:num_det_box],
'detection_scores': det_score[:num_det_box],
'detection_classes': det_class[:num_det_box]})
# Unpack items from the evaluation dictionary. # Unpack items from the evaluation dictionary.
input_data_fields = standard_fields.InputDataFields input_data_fields = standard_fields.InputDataFields
...@@ -284,6 +287,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -284,6 +287,7 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
num_gt_boxes_per_image = eval_dict.get( num_gt_boxes_per_image = eval_dict.get(
'num_groundtruth_boxes_per_image', None) 'num_groundtruth_boxes_per_image', None)
num_det_boxes_per_image = eval_dict.get('num_det_boxes_per_image', None) num_det_boxes_per_image = eval_dict.get('num_det_boxes_per_image', None)
is_annotated = eval_dict.get('is_annotated', None)
if groundtruth_is_crowd is None: if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool) groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
...@@ -306,6 +310,11 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -306,6 +310,11 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
num_det_boxes_per_image = tf.shape(detection_boxes)[1:2] num_det_boxes_per_image = tf.shape(detection_boxes)[1:2]
else: else:
num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0) num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0)
if is_annotated is None:
is_annotated = tf.constant([True])
else:
is_annotated = tf.expand_dims(is_annotated, 0)
else: else:
if num_gt_boxes_per_image is None: if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.tile( num_gt_boxes_per_image = tf.tile(
...@@ -315,6 +324,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -315,6 +324,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
num_det_boxes_per_image = tf.tile( num_det_boxes_per_image = tf.tile(
tf.shape(detection_boxes)[1:2], tf.shape(detection_boxes)[1:2],
multiples=tf.shape(detection_boxes)[0:1]) multiples=tf.shape(detection_boxes)[0:1])
if is_annotated is None:
is_annotated = tf.ones_like(image_id, dtype=tf.bool)
update_op = tf.py_func(update_op, [image_id, update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes, groundtruth_boxes,
...@@ -324,7 +335,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -324,7 +335,8 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
detection_boxes, detection_boxes,
detection_scores, detection_scores,
detection_classes, detection_classes,
num_det_boxes_per_image], []) num_det_boxes_per_image,
is_annotated], [])
metric_names = ['DetectionBoxes_Precision/mAP', metric_names = ['DetectionBoxes_Precision/mAP',
'DetectionBoxes_Precision/mAP@.50IOU', 'DetectionBoxes_Precision/mAP@.50IOU',
'DetectionBoxes_Precision/mAP@.75IOU', 'DetectionBoxes_Precision/mAP@.75IOU',
...@@ -581,8 +593,11 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -581,8 +593,11 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
Args: Args:
eval_dict: A dictionary that holds tensors for evaluating object detection eval_dict: A dictionary that holds tensors for evaluating object detection
performance. This dictionary may be produced from performance. For single-image evaluation, this dictionary may be
eval_util.result_dict_for_single_example(). produced from eval_util.result_dict_for_single_example(). For multi-image
evaluation, `eval_dict` should contain the fields
'num_groundtruth_boxes_per_image' and 'num_det_boxes_per_image' to
properly unpad the tensors from the batch.
Returns: Returns:
a dictionary of metric names to tuple of value_op and update_op that can a dictionary of metric names to tuple of value_op and update_op that can
...@@ -590,27 +605,41 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -590,27 +605,41 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
update ops must be run together and similarly all value ops must be run update ops must be run together and similarly all value ops must be run
together to guarantee correct behaviour. together to guarantee correct behaviour.
""" """
def update_op(
image_id, def update_op(image_id_batched, groundtruth_boxes_batched,
groundtruth_boxes, groundtruth_classes_batched,
groundtruth_classes, groundtruth_instance_masks_batched,
groundtruth_instance_masks, groundtruth_is_crowd_batched, num_gt_boxes_per_image,
groundtruth_is_crowd, detection_scores_batched, detection_classes_batched,
detection_scores, detection_masks_batched, num_det_boxes_per_image):
detection_classes,
detection_masks):
"""Update op for metrics.""" """Update op for metrics."""
self.add_single_ground_truth_image_info(
image_id, for (image_id, groundtruth_boxes, groundtruth_classes,
{'groundtruth_boxes': groundtruth_boxes, groundtruth_instance_masks, groundtruth_is_crowd, num_gt_box,
'groundtruth_classes': groundtruth_classes, detection_scores, detection_classes,
'groundtruth_instance_masks': groundtruth_instance_masks, detection_masks, num_det_box) in zip(
'groundtruth_is_crowd': groundtruth_is_crowd}) image_id_batched, groundtruth_boxes_batched,
self.add_single_detected_image_info( groundtruth_classes_batched, groundtruth_instance_masks_batched,
image_id, groundtruth_is_crowd_batched, num_gt_boxes_per_image,
{'detection_scores': detection_scores, detection_scores_batched, detection_classes_batched,
'detection_classes': detection_classes, detection_masks_batched, num_det_boxes_per_image):
'detection_masks': detection_masks}) self.add_single_ground_truth_image_info(
image_id, {
'groundtruth_boxes':
groundtruth_boxes[:num_gt_box],
'groundtruth_classes':
groundtruth_classes[:num_gt_box],
'groundtruth_instance_masks':
groundtruth_instance_masks[:num_gt_box],
'groundtruth_is_crowd':
groundtruth_is_crowd[:num_gt_box]
})
self.add_single_detected_image_info(
image_id, {
'detection_scores': detection_scores[:num_det_box],
'detection_classes': detection_classes[:num_det_box],
'detection_masks': detection_masks[:num_det_box]
})
# Unpack items from the evaluation dictionary. # Unpack items from the evaluation dictionary.
input_data_fields = standard_fields.InputDataFields input_data_fields = standard_fields.InputDataFields
...@@ -622,20 +651,54 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): ...@@ -622,20 +651,54 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
input_data_fields.groundtruth_instance_masks] input_data_fields.groundtruth_instance_masks]
groundtruth_is_crowd = eval_dict.get( groundtruth_is_crowd = eval_dict.get(
input_data_fields.groundtruth_is_crowd, None) input_data_fields.groundtruth_is_crowd, None)
num_gt_boxes_per_image = eval_dict.get(
input_data_fields.num_groundtruth_boxes, None)
detection_scores = eval_dict[detection_fields.detection_scores] detection_scores = eval_dict[detection_fields.detection_scores]
detection_classes = eval_dict[detection_fields.detection_classes] detection_classes = eval_dict[detection_fields.detection_classes]
detection_masks = eval_dict[detection_fields.detection_masks] detection_masks = eval_dict[detection_fields.detection_masks]
num_det_boxes_per_image = eval_dict.get(detection_fields.num_detections,
None)
if groundtruth_is_crowd is None: if groundtruth_is_crowd is None:
groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool) groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool)
update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes, if not image_id.shape.as_list():
groundtruth_classes, # Apply a batch dimension to all tensors.
groundtruth_instance_masks, image_id = tf.expand_dims(image_id, 0)
groundtruth_is_crowd, groundtruth_boxes = tf.expand_dims(groundtruth_boxes, 0)
detection_scores, groundtruth_classes = tf.expand_dims(groundtruth_classes, 0)
detection_classes, groundtruth_instance_masks = tf.expand_dims(groundtruth_instance_masks, 0)
detection_masks], []) groundtruth_is_crowd = tf.expand_dims(groundtruth_is_crowd, 0)
detection_scores = tf.expand_dims(detection_scores, 0)
detection_classes = tf.expand_dims(detection_classes, 0)
detection_masks = tf.expand_dims(detection_masks, 0)
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.shape(groundtruth_boxes)[1:2]
else:
num_gt_boxes_per_image = tf.expand_dims(num_gt_boxes_per_image, 0)
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.shape(detection_scores)[1:2]
else:
num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0)
else:
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.tile(
tf.shape(groundtruth_boxes)[1:2],
multiples=tf.shape(groundtruth_boxes)[0:1])
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.tile(
tf.shape(detection_scores)[1:2],
multiples=tf.shape(detection_scores)[0:1])
update_op = tf.py_func(update_op, [
image_id, groundtruth_boxes, groundtruth_classes,
groundtruth_instance_masks, groundtruth_is_crowd,
num_gt_boxes_per_image, detection_scores, detection_classes,
detection_masks, num_det_boxes_per_image
], [])
metric_names = ['DetectionMasks_Precision/mAP', metric_names = ['DetectionMasks_Precision/mAP',
'DetectionMasks_Precision/mAP@.50IOU', 'DetectionMasks_Precision/mAP@.50IOU',
'DetectionMasks_Precision/mAP@.75IOU', 'DetectionMasks_Precision/mAP@.75IOU',
......
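The batched update ops above rely on a simple unpadding convention: per-image tensors arrive padded to a common length, and num_groundtruth_boxes_per_image / num_det_boxes_per_image record the true counts. A small sketch of that slicing, using illustrative numpy arrays rather than the evaluator's tensors:

import numpy as np

# 2 images, padded to 3 ground-truth boxes each.
groundtruth_boxes = np.zeros((2, 3, 4), dtype=np.float32)
groundtruth_boxes[0, :2] = [[0., 0., 1., 1.], [0., 0., .5, .5]]
groundtruth_boxes[1, :1] = [[0., 0., .25, .25]]
num_gt_boxes_per_image = np.array([2, 1])

for boxes, num_gt in zip(groundtruth_boxes, num_gt_boxes_per_image):
  unpadded = boxes[:num_gt]   # only the real boxes are passed on to
  print(unpadded.shape)       # add_single_ground_truth_image_info
# prints (2, 4) then (1, 4)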
...@@ -308,6 +308,99 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase): ...@@ -308,6 +308,99 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_boxes_list) self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids) self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsIsAnnotated(self):
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
_get_categories_list())
image_id = tf.placeholder(tf.string, shape=())
groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(None))
is_annotated = tf.placeholder(tf.bool, shape=())
detection_boxes = tf.placeholder(tf.float32, shape=(None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(None))
detection_classes = tf.placeholder(tf.float32, shape=(None))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
'is_annotated': is_annotated,
detection_fields.detection_boxes: detection_boxes,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(update_op,
feed_dict={
image_id: 'image1',
groundtruth_boxes: np.array([[100., 100., 200., 200.]]),
groundtruth_classes: np.array([1]),
is_annotated: True,
detection_boxes: np.array([[100., 100., 200., 200.]]),
detection_scores: np.array([.8]),
detection_classes: np.array([1])
})
sess.run(update_op,
feed_dict={
image_id: 'image2',
groundtruth_boxes: np.array([[50., 50., 100., 100.]]),
groundtruth_classes: np.array([3]),
is_annotated: True,
detection_boxes: np.array([[50., 50., 100., 100.]]),
detection_scores: np.array([.7]),
detection_classes: np.array([3])
})
sess.run(update_op,
feed_dict={
image_id: 'image3',
groundtruth_boxes: np.array([[25., 25., 50., 50.]]),
groundtruth_classes: np.array([2]),
is_annotated: True,
detection_boxes: np.array([[25., 25., 50., 50.]]),
detection_scores: np.array([.9]),
detection_classes: np.array([2])
})
sess.run(update_op,
feed_dict={
image_id: 'image4',
groundtruth_boxes: np.zeros((0, 4)),
groundtruth_classes: np.zeros((0)),
is_annotated: False, # Note that this image isn't annotated.
detection_boxes: np.array([[25., 25., 50., 50.],
[25., 25., 70., 50.],
[25., 25., 80., 50.],
[25., 25., 90., 50.]]),
detection_scores: np.array([0.6, 0.7, 0.8, 0.9]),
detection_classes: np.array([1, 2, 2, 3])
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self): def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self):
coco_evaluator = coco_evaluation.CocoDetectionEvaluator( coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
_get_categories_list()) _get_categories_list())
...@@ -665,22 +758,40 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase): ...@@ -665,22 +758,40 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
_, update_op = eval_metric_ops['DetectionMasks_Precision/mAP'] _, update_op = eval_metric_ops['DetectionMasks_Precision/mAP']
with self.test_session() as sess: with self.test_session() as sess:
sess.run(update_op, sess.run(
feed_dict={ update_op,
image_id: 'image1', feed_dict={
groundtruth_boxes: np.array([[100., 100., 200., 200.]]), image_id:
groundtruth_classes: np.array([1]), 'image1',
groundtruth_masks: np.pad(np.ones([1, 100, 100], groundtruth_boxes:
dtype=np.uint8), np.array([[100., 100., 200., 200.], [50., 50., 100., 100.]]),
((0, 0), (10, 10), (10, 10)), groundtruth_classes:
mode='constant'), np.array([1, 2]),
detection_scores: np.array([.8]), groundtruth_masks:
detection_classes: np.array([1]), np.stack([
detection_masks: np.pad(np.ones([1, 100, 100], np.pad(
dtype=np.uint8), np.ones([100, 100], dtype=np.uint8), ((10, 10),
((0, 0), (10, 10), (10, 10)), (10, 10)),
mode='constant') mode='constant'),
}) np.pad(
np.ones([50, 50], dtype=np.uint8), ((0, 70), (0, 70)),
mode='constant')
]),
detection_scores:
np.array([.9, .8]),
detection_classes:
np.array([2, 1]),
detection_masks:
np.stack([
np.pad(
np.ones([50, 50], dtype=np.uint8), ((0, 70), (0, 70)),
mode='constant'),
np.pad(
np.ones([100, 100], dtype=np.uint8), ((10, 10),
(10, 10)),
mode='constant'),
])
})
sess.run(update_op, sess.run(update_op,
feed_dict={ feed_dict={
image_id: 'image2', image_id: 'image2',
...@@ -735,6 +846,106 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase): ...@@ -735,6 +846,106 @@ class CocoMaskEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._image_id_to_mask_shape_map) self.assertFalse(coco_evaluator._image_id_to_mask_shape_map)
self.assertFalse(coco_evaluator._detection_masks_list) self.assertFalse(coco_evaluator._detection_masks_list)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsBatched(self):
coco_evaluator = coco_evaluation.CocoMaskEvaluator(_get_categories_list())
batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
groundtruth_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_masks = tf.placeholder(
tf.uint8, shape=(batch_size, None, None, None))
input_data_fields = standard_fields.InputDataFields
detection_fields = standard_fields.DetectionResultFields
eval_dict = {
input_data_fields.key: image_id,
input_data_fields.groundtruth_boxes: groundtruth_boxes,
input_data_fields.groundtruth_classes: groundtruth_classes,
input_data_fields.groundtruth_instance_masks: groundtruth_masks,
detection_fields.detection_scores: detection_scores,
detection_fields.detection_classes: detection_classes,
detection_fields.detection_masks: detection_masks,
}
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(eval_dict)
_, update_op = eval_metric_ops['DetectionMasks_Precision/mAP']
with self.test_session() as sess:
sess.run(
update_op,
feed_dict={
image_id: ['image1', 'image2', 'image3'],
groundtruth_boxes:
np.array([[[100., 100., 200., 200.]],
[[50., 50., 100., 100.]],
[[25., 25., 50., 50.]]]),
groundtruth_classes:
np.array([[1], [1], [1]]),
groundtruth_masks:
np.stack([
np.pad(
np.ones([1, 100, 100], dtype=np.uint8),
((0, 0), (0, 0), (0, 0)),
mode='constant'),
np.pad(
np.ones([1, 50, 50], dtype=np.uint8),
((0, 0), (25, 25), (25, 25)),
mode='constant'),
np.pad(
np.ones([1, 25, 25], dtype=np.uint8),
((0, 0), (37, 38), (37, 38)),
mode='constant')
],
axis=0),
detection_scores:
np.array([[.8], [.8], [.8]]),
detection_classes:
np.array([[1], [1], [1]]),
detection_masks:
np.stack([
np.pad(
np.ones([1, 100, 100], dtype=np.uint8),
((0, 0), (0, 0), (0, 0)),
mode='constant'),
np.pad(
np.ones([1, 50, 50], dtype=np.uint8),
((0, 0), (25, 25), (25, 25)),
mode='constant'),
np.pad(
np.ones([1, 25, 25], dtype=np.uint8),
((0, 0), (37, 38), (37, 38)),
mode='constant')
],
axis=0)
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (medium)'],
1.0)
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._image_ids_with_detections)
self.assertFalse(coco_evaluator._image_id_to_mask_shape_map)
self.assertFalse(coco_evaluator._detection_masks_list)
if __name__ == '__main__': if __name__ == '__main__':
tf.test.main() tf.test.main()
...@@ -25,6 +25,7 @@ import os ...@@ -25,6 +25,7 @@ import os
import tensorflow as tf import tensorflow as tf
from object_detection import eval_util from object_detection import eval_util
from object_detection import exporter as exporter_lib
from object_detection import inputs from object_detection import inputs
from object_detection.builders import graph_rewriter_builder from object_detection.builders import graph_rewriter_builder
from object_detection.builders import model_builder from object_detection.builders import model_builder
...@@ -306,8 +307,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -306,8 +307,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
prediction_dict, features[fields.InputDataFields.true_image_shape]) prediction_dict, features[fields.InputDataFields.true_image_shape])
losses = [loss_tensor for loss_tensor in losses_dict.values()] losses = [loss_tensor for loss_tensor in losses_dict.values()]
if train_config.add_regularization_loss: if train_config.add_regularization_loss:
regularization_losses = tf.get_collection( regularization_losses = detection_model.regularization_losses()
tf.GraphKeys.REGULARIZATION_LOSSES)
if regularization_losses: if regularization_losses:
regularization_loss = tf.add_n( regularization_loss = tf.add_n(
regularization_losses, name='regularization_loss') regularization_losses, name='regularization_loss')
...@@ -353,20 +353,24 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -353,20 +353,24 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
for var in optimizer_summary_vars: for var in optimizer_summary_vars:
tf.summary.scalar(var.op.name, var) tf.summary.scalar(var.op.name, var)
summaries = [] if use_tpu else None summaries = [] if use_tpu else None
if train_config.summarize_gradients:
summaries = ['gradients', 'gradient_norm', 'global_gradient_norm']
train_op = tf.contrib.layers.optimize_loss( train_op = tf.contrib.layers.optimize_loss(
loss=total_loss, loss=total_loss,
global_step=global_step, global_step=global_step,
learning_rate=None, learning_rate=None,
clip_gradients=clip_gradients_value, clip_gradients=clip_gradients_value,
optimizer=training_optimizer, optimizer=training_optimizer,
update_ops=detection_model.updates(),
variables=trainable_variables, variables=trainable_variables,
summaries=summaries, summaries=summaries,
name='') # Preventing scope prefix on all variables. name='') # Preventing scope prefix on all variables.
if mode == tf.estimator.ModeKeys.PREDICT: if mode == tf.estimator.ModeKeys.PREDICT:
exported_output = exporter_lib.add_output_tensor_nodes(detections)
export_outputs = { export_outputs = {
tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
tf.estimator.export.PredictOutput(detections) tf.estimator.export.PredictOutput(exported_output)
} }
eval_metric_ops = None eval_metric_ops = None
...@@ -456,6 +460,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): ...@@ -456,6 +460,7 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
def create_estimator_and_inputs(run_config, def create_estimator_and_inputs(run_config,
hparams, hparams,
pipeline_config_path, pipeline_config_path,
config_override=None,
train_steps=None, train_steps=None,
sample_1_of_n_eval_examples=1, sample_1_of_n_eval_examples=1,
sample_1_of_n_eval_on_train_examples=1, sample_1_of_n_eval_on_train_examples=1,
...@@ -465,6 +470,7 @@ def create_estimator_and_inputs(run_config, ...@@ -465,6 +470,7 @@ def create_estimator_and_inputs(run_config,
num_shards=1, num_shards=1,
params=None, params=None,
override_eval_num_epochs=True, override_eval_num_epochs=True,
save_final_config=False,
**kwargs): **kwargs):
"""Creates `Estimator`, input functions, and steps. """Creates `Estimator`, input functions, and steps.
...@@ -472,6 +478,8 @@ def create_estimator_and_inputs(run_config, ...@@ -472,6 +478,8 @@ def create_estimator_and_inputs(run_config,
run_config: A `RunConfig`. run_config: A `RunConfig`.
hparams: A `HParams`. hparams: A `HParams`.
pipeline_config_path: A path to a pipeline config file. pipeline_config_path: A path to a pipeline config file.
config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
override the config from `pipeline_config_path`.
train_steps: Number of training steps. If None, the number of training steps train_steps: Number of training steps. If None, the number of training steps
is set from the `TrainConfig` proto. is set from the `TrainConfig` proto.
sample_1_of_n_eval_examples: Integer representing how often an eval example sample_1_of_n_eval_examples: Integer representing how often an eval example
...@@ -499,6 +507,8 @@ def create_estimator_and_inputs(run_config, ...@@ -499,6 +507,8 @@ def create_estimator_and_inputs(run_config,
`use_tpu_estimator` is True. `use_tpu_estimator` is True.
override_eval_num_epochs: Whether to overwrite the number of epochs to override_eval_num_epochs: Whether to overwrite the number of epochs to
1 for eval_input. 1 for eval_input.
save_final_config: Whether to save final config (obtained after applying
overrides) to `estimator.model_dir`.
**kwargs: Additional keyword arguments for configuration override. **kwargs: Additional keyword arguments for configuration override.
Returns: Returns:
...@@ -522,7 +532,8 @@ def create_estimator_and_inputs(run_config, ...@@ -522,7 +532,8 @@ def create_estimator_and_inputs(run_config,
create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn'] create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn']
create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']
configs = get_configs_from_pipeline_file(pipeline_config_path) configs = get_configs_from_pipeline_file(pipeline_config_path,
config_override=config_override)
kwargs.update({ kwargs.update({
'train_steps': train_steps, 'train_steps': train_steps,
'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples 'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples
...@@ -595,7 +606,7 @@ def create_estimator_and_inputs(run_config, ...@@ -595,7 +606,7 @@ def create_estimator_and_inputs(run_config,
estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
# Write the as-run pipeline config to disk. # Write the as-run pipeline config to disk.
if run_config.is_chief: if run_config.is_chief and save_final_config:
pipeline_config_final = create_pipeline_proto_from_configs(configs) pipeline_config_final = create_pipeline_proto_from_configs(configs)
config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir) config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir)
...@@ -641,11 +652,17 @@ def create_train_and_eval_specs(train_input_fn, ...@@ -641,11 +652,17 @@ def create_train_and_eval_specs(train_input_fn,
input_fn=train_input_fn, max_steps=train_steps) input_fn=train_input_fn, max_steps=train_steps)
if eval_spec_names is None: if eval_spec_names is None:
eval_spec_names = [ str(i) for i in range(len(eval_input_fns)) ] eval_spec_names = [str(i) for i in range(len(eval_input_fns))]
eval_specs = [] eval_specs = []
for eval_spec_name, eval_input_fn in zip(eval_spec_names, eval_input_fns): for index, (eval_spec_name, eval_input_fn) in enumerate(
exporter_name = '{}_{}'.format(final_exporter_name, eval_spec_name) zip(eval_spec_names, eval_input_fns)):
# Uses final_exporter_name as exporter_name for the first eval spec for
# backward compatibility.
if index == 0:
exporter_name = final_exporter_name
else:
exporter_name = '{}_{}'.format(final_exporter_name, eval_spec_name)
exporter = tf.estimator.FinalExporter( exporter = tf.estimator.FinalExporter(
name=exporter_name, serving_input_receiver_fn=predict_input_fn) name=exporter_name, serving_input_receiver_fn=predict_input_fn)
eval_specs.append( eval_specs.append(
...@@ -747,6 +764,7 @@ def populate_experiment(run_config, ...@@ -747,6 +764,7 @@ def populate_experiment(run_config,
train_steps=train_steps, train_steps=train_steps,
eval_steps=eval_steps, eval_steps=eval_steps,
model_fn_creator=model_fn_creator, model_fn_creator=model_fn_creator,
save_final_config=True,
**kwargs) **kwargs)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
......
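A hedged usage sketch of the new config_override and save_final_config arguments; the RunConfig, HParams helper, and file paths below are placeholders rather than values taken from this change.

import tensorflow as tf
from object_detection import model_hparams
from object_detection import model_lib

# Illustrative only: tweak the eval config via a text proto and only write
# the as-run pipeline config when explicitly requested.
config_override = """
eval_config {
  num_examples: 100
}
"""
train_and_eval_dict = model_lib.create_estimator_and_inputs(
    run_config=tf.estimator.RunConfig(model_dir='/tmp/model_dir'),
    hparams=model_hparams.create_hparams(None),
    pipeline_config_path='/path/to/pipeline.config',
    config_override=config_override,
    save_final_config=True)
estimator = train_and_eval_dict['estimator']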
...@@ -310,7 +310,7 @@ class ModelLibTest(tf.test.TestCase): ...@@ -310,7 +310,7 @@ class ModelLibTest(tf.test.TestCase):
self.assertEqual(2, len(eval_specs)) self.assertEqual(2, len(eval_specs))
self.assertEqual(None, eval_specs[0].steps) self.assertEqual(None, eval_specs[0].steps)
self.assertEqual('holdout', eval_specs[0].name) self.assertEqual('holdout', eval_specs[0].name)
self.assertEqual('exporter_holdout', eval_specs[0].exporters[0].name) self.assertEqual('exporter', eval_specs[0].exporters[0].name)
self.assertEqual(None, eval_specs[1].steps) self.assertEqual(None, eval_specs[1].steps)
self.assertEqual('eval_on_train', eval_specs[1].name) self.assertEqual('eval_on_train', eval_specs[1].name)
......
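A quick illustration of the exporter-naming rule the test above now checks: the first eval spec keeps the bare final_exporter_name for backward compatibility, later specs get a suffixed name.

final_exporter_name = 'exporter'
eval_spec_names = ['holdout', 'eval_on_train']

exporter_names = []
for index, eval_spec_name in enumerate(eval_spec_names):
  if index == 0:
    exporter_names.append(final_exporter_name)
  else:
    exporter_names.append('{}_{}'.format(final_exporter_name, eval_spec_name))
print(exporter_names)  # ['exporter', 'exporter_eval_on_train']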
...@@ -114,6 +114,7 @@ def main(unused_argv): ...@@ -114,6 +114,7 @@ def main(unused_argv):
use_tpu_estimator=True, use_tpu_estimator=True,
use_tpu=FLAGS.use_tpu, use_tpu=FLAGS.use_tpu,
num_shards=FLAGS.num_shards, num_shards=FLAGS.num_shards,
save_final_config=FLAGS.mode == 'train',
**kwargs) **kwargs)
estimator = train_and_eval_dict['estimator'] estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn'] train_input_fn = train_and_eval_dict['train_input_fn']
......
...@@ -72,6 +72,8 @@ class FasterRCNNResnetV1FeatureExtractor( ...@@ -72,6 +72,8 @@ class FasterRCNNResnetV1FeatureExtractor(
VGG style channel mean subtraction as described here: VGG style channel mean subtraction as described here:
https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md
Note that if the number of channels is not equal to 3, the mean subtraction
will be skipped and the original resized_inputs will be returned.
Args: Args:
resized_inputs: A [batch, height_in, width_in, channels] float32 tensor resized_inputs: A [batch, height_in, width_in, channels] float32 tensor
...@@ -82,8 +84,11 @@ class FasterRCNNResnetV1FeatureExtractor( ...@@ -82,8 +84,11 @@ class FasterRCNNResnetV1FeatureExtractor(
tensor representing a batch of images. tensor representing a batch of images.
""" """
channel_means = [123.68, 116.779, 103.939] if resized_inputs.shape.as_list()[3] == 3:
return resized_inputs - [[channel_means]] channel_means = [123.68, 116.779, 103.939]
return resized_inputs - [[channel_means]]
else:
return resized_inputs
def _extract_proposal_features(self, preprocessed_inputs, scope): def _extract_proposal_features(self, preprocessed_inputs, scope):
"""Extracts first stage RPN features. """Extracts first stage RPN features.
......
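A small standalone sketch of the behaviour the updated docstring describes: VGG-style channel-mean subtraction is applied only for 3-channel inputs and is otherwise skipped. The free function below is illustrative, not the extractor's own method.

import tensorflow as tf

def vgg_style_preprocess(resized_inputs):
  # Subtract the per-channel ImageNet means only when there are 3 channels.
  if resized_inputs.shape.as_list()[3] == 3:
    channel_means = [123.68, 116.779, 103.939]
    return resized_inputs - [[channel_means]]
  return resized_inputs

rgb = tf.zeros([1, 224, 224, 3])
gray = tf.zeros([1, 224, 224, 1])
print(vgg_style_preprocess(rgb).shape)   # (1, 224, 224, 3), means subtracted
print(vgg_style_preprocess(gray).shape)  # (1, 224, 224, 1), returned unchanged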
...@@ -146,7 +146,6 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model): ...@@ -146,7 +146,6 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model):
use_depthwise = feature_map_layout['use_depthwise'] use_depthwise = feature_map_layout['use_depthwise']
for index, from_layer in enumerate(feature_map_layout['from_layer']): for index, from_layer in enumerate(feature_map_layout['from_layer']):
net = [] net = []
self.convolutions.append(net)
layer_depth = feature_map_layout['layer_depth'][index] layer_depth = feature_map_layout['layer_depth'][index]
conv_kernel_size = 3 conv_kernel_size = 3
if 'conv_kernel_size' in feature_map_layout: if 'conv_kernel_size' in feature_map_layout:
...@@ -231,6 +230,10 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model): ...@@ -231,6 +230,10 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model):
conv_hyperparams.build_activation_layer( conv_hyperparams.build_activation_layer(
name=layer_name)) name=layer_name))
# Until certain bugs are fixed in checkpointable lists,
# this net must be appended only once it's been filled with layers
self.convolutions.append(net)
def call(self, image_features): def call(self, image_features):
"""Generate the multi-resolution feature maps. """Generate the multi-resolution feature maps.
...@@ -263,7 +266,8 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model): ...@@ -263,7 +266,8 @@ class KerasMultiResolutionFeatureMaps(tf.keras.Model):
def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
min_depth, insert_1x1_conv, image_features): min_depth, insert_1x1_conv, image_features,
pool_residual=False):
"""Generates multi resolution feature maps from input image features. """Generates multi resolution feature maps from input image features.
Generates multi-scale feature maps for detection as in the SSD papers by Generates multi-scale feature maps for detection as in the SSD papers by
...@@ -317,6 +321,13 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -317,6 +321,13 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
should be inserted before shrinking the feature map. should be inserted before shrinking the feature map.
image_features: A dictionary of handles to activation tensors from the image_features: A dictionary of handles to activation tensors from the
base feature extractor. base feature extractor.
pool_residual: Whether to add an average pooling layer followed by a
residual connection between subsequent feature maps when the channel
depths match. For example, with option 'layer_depth': [-1, 512, 256, 256],
a pooling and residual layer is added between the third and fourth feature
map. This option is better used with Weight Shared Convolution Box
Predictor when all feature maps have the same channel depth to encourage
more consistent features across multi-scale feature maps.
Returns: Returns:
feature_maps: an OrderedDict mapping keys (feature map names) to feature_maps: an OrderedDict mapping keys (feature map names) to
...@@ -350,6 +361,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -350,6 +361,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
feature_map_keys.append(from_layer) feature_map_keys.append(from_layer)
else: else:
pre_layer = feature_maps[-1] pre_layer = feature_maps[-1]
pre_layer_depth = pre_layer.get_shape().as_list()[3]
intermediate_layer = pre_layer intermediate_layer = pre_layer
if insert_1x1_conv: if insert_1x1_conv:
layer_name = '{}_1_Conv2d_{}_1x1_{}'.format( layer_name = '{}_1_Conv2d_{}_1x1_{}'.format(
...@@ -383,6 +395,12 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -383,6 +395,12 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
padding='SAME', padding='SAME',
stride=1, stride=1,
scope=layer_name) scope=layer_name)
if pool_residual and pre_layer_depth == depth_fn(layer_depth):
feature_map += slim.avg_pool2d(
pre_layer, [3, 3],
padding='SAME',
stride=2,
scope=layer_name + '_pool')
else: else:
feature_map = slim.conv2d( feature_map = slim.conv2d(
intermediate_layer, intermediate_layer,
...@@ -399,6 +417,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, ...@@ -399,6 +417,7 @@ def multi_resolution_feature_maps(feature_map_layout, depth_multiplier,
def fpn_top_down_feature_maps(image_features, def fpn_top_down_feature_maps(image_features,
depth, depth,
use_depthwise=False, use_depthwise=False,
use_explicit_padding=False,
scope=None): scope=None):
"""Generates `top-down` feature maps for Feature Pyramid Networks. """Generates `top-down` feature maps for Feature Pyramid Networks.
...@@ -409,7 +428,9 @@ def fpn_top_down_feature_maps(image_features, ...@@ -409,7 +428,9 @@ def fpn_top_down_feature_maps(image_features,
Spatial resolutions of successive tensors must reduce exactly by a factor Spatial resolutions of successive tensors must reduce exactly by a factor
of 2. of 2.
depth: depth of output feature maps. depth: depth of output feature maps.
use_depthwise: use depthwise separable conv instead of regular conv. use_depthwise: whether to use depthwise separable conv instead of regular
conv.
use_explicit_padding: whether to use explicit padding.
scope: A scope name to wrap this op under. scope: A scope name to wrap this op under.
Returns: Returns:
...@@ -420,8 +441,10 @@ def fpn_top_down_feature_maps(image_features, ...@@ -420,8 +441,10 @@ def fpn_top_down_feature_maps(image_features,
num_levels = len(image_features) num_levels = len(image_features)
output_feature_maps_list = [] output_feature_maps_list = []
output_feature_map_keys = [] output_feature_map_keys = []
padding = 'VALID' if use_explicit_padding else 'SAME'
kernel_size = 3
with slim.arg_scope( with slim.arg_scope(
[slim.conv2d, slim.separable_conv2d], padding='SAME', stride=1): [slim.conv2d, slim.separable_conv2d], padding=padding, stride=1):
top_down = slim.conv2d( top_down = slim.conv2d(
image_features[-1][1], image_features[-1][1],
depth, [1, 1], activation_fn=None, normalizer_fn=None, depth, [1, 1], activation_fn=None, normalizer_fn=None,
...@@ -436,14 +459,20 @@ def fpn_top_down_feature_maps(image_features, ...@@ -436,14 +459,20 @@ def fpn_top_down_feature_maps(image_features,
image_features[level][1], depth, [1, 1], image_features[level][1], depth, [1, 1],
activation_fn=None, normalizer_fn=None, activation_fn=None, normalizer_fn=None,
scope='projection_%d' % (level + 1)) scope='projection_%d' % (level + 1))
if use_explicit_padding:
# slice top_down to the same shape as residual
residual_shape = tf.shape(residual)
top_down = top_down[:, :residual_shape[1], :residual_shape[2], :]
top_down += residual top_down += residual
if use_depthwise: if use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1) conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else: else:
conv_op = slim.conv2d conv_op = slim.conv2d
if use_explicit_padding:
top_down = ops.fixed_padding(top_down, kernel_size)
output_feature_maps_list.append(conv_op( output_feature_maps_list.append(conv_op(
top_down, top_down,
depth, [3, 3], depth, [kernel_size, kernel_size],
scope='smoothing_%d' % (level + 1))) scope='smoothing_%d' % (level + 1)))
output_feature_map_keys.append('top_down_%s' % image_features[level][0]) output_feature_map_keys.append('top_down_%s' % image_features[level][0])
return collections.OrderedDict(reversed( return collections.OrderedDict(reversed(
......
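A minimal sketch of the pool_residual option documented above: when the previous feature map has the same channel depth as the new one, a stride-2 3x3 average pool of it is added as a residual. Plain tf.nn ops stand in for the slim call, purely for illustration.

import tensorflow as tf

pre_layer = tf.random_uniform([4, 8, 8, 256])     # previous scale
feature_map = tf.random_uniform([4, 4, 4, 256])   # stride-2 conv output

# Channel depths match, so add an average-pooled residual from the previous
# scale to encourage consistent features across the pyramid.
pooled = tf.nn.avg_pool(
    pre_layer, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
feature_map += pooled
print(feature_map.shape)  # (4, 4, 4, 256)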
...@@ -45,6 +45,11 @@ EMBEDDED_SSD_MOBILENET_V1_LAYOUT = { ...@@ -45,6 +45,11 @@ EMBEDDED_SSD_MOBILENET_V1_LAYOUT = {
'conv_kernel_size': [-1, -1, 3, 3, 2], 'conv_kernel_size': [-1, -1, 3, 3, 2],
} }
SSD_MOBILENET_V1_WEIGHT_SHARED_LAYOUT = {
'from_layer': ['Conv2d_13_pointwise', '', '', ''],
'layer_depth': [-1, 256, 256, 256],
}
@parameterized.parameters( @parameterized.parameters(
{'use_keras': False}, {'use_keras': False},
...@@ -67,7 +72,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -67,7 +72,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams) text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams) return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)
def _build_feature_map_generator(self, feature_map_layout, use_keras): def _build_feature_map_generator(self, feature_map_layout, use_keras,
pool_residual=False):
if use_keras: if use_keras:
return feature_map_generators.KerasMultiResolutionFeatureMaps( return feature_map_generators.KerasMultiResolutionFeatureMaps(
feature_map_layout=feature_map_layout, feature_map_layout=feature_map_layout,
...@@ -86,7 +92,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -86,7 +92,8 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
depth_multiplier=1, depth_multiplier=1,
min_depth=32, min_depth=32,
insert_1x1_conv=True, insert_1x1_conv=True,
image_features=image_features) image_features=image_features,
pool_residual=pool_residual)
return feature_map_generator return feature_map_generator
def test_get_expected_feature_map_shapes_with_inception_v2(self, use_keras): def test_get_expected_feature_map_shapes_with_inception_v2(self, use_keras):
...@@ -209,6 +216,34 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): ...@@ -209,6 +216,34 @@ class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase):
(key, value.shape) for key, value in out_feature_maps.items()) (key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes) self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_feature_map_shapes_with_pool_residual_ssd_mobilenet_v1(
self, use_keras):
image_features = {
'Conv2d_13_pointwise': tf.random_uniform([4, 8, 8, 1024],
dtype=tf.float32),
}
feature_map_generator = self._build_feature_map_generator(
feature_map_layout=SSD_MOBILENET_V1_WEIGHT_SHARED_LAYOUT,
use_keras=use_keras,
pool_residual=True
)
feature_maps = feature_map_generator(image_features)
expected_feature_map_shapes = {
'Conv2d_13_pointwise': (4, 8, 8, 1024),
'Conv2d_13_pointwise_2_Conv2d_1_3x3_s2_256': (4, 4, 4, 256),
'Conv2d_13_pointwise_2_Conv2d_2_3x3_s2_256': (4, 2, 2, 256),
'Conv2d_13_pointwise_2_Conv2d_3_3x3_s2_256': (4, 1, 1, 256)}
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
out_feature_maps = sess.run(feature_maps)
out_feature_map_shapes = dict(
(key, value.shape) for key, value in out_feature_maps.items())
self.assertDictEqual(expected_feature_map_shapes, out_feature_map_shapes)
def test_get_expected_variable_names_with_inception_v2(self, use_keras): def test_get_expected_variable_names_with_inception_v2(self, use_keras):
image_features = { image_features = {
'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32), 'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32),
......
...@@ -82,6 +82,8 @@ class _LayersOverride(object): ...@@ -82,6 +82,8 @@ class _LayersOverride(object):
self._conv_hyperparams = conv_hyperparams self._conv_hyperparams = conv_hyperparams
self._use_explicit_padding = use_explicit_padding self._use_explicit_padding = use_explicit_padding
self._min_depth = min_depth self._min_depth = min_depth
self.regularizer = tf.keras.regularizers.l2(0.00004 * 0.5)
self.initializer = tf.truncated_normal_initializer(stddev=0.09)
def _FixedPaddingLayer(self, kernel_size): def _FixedPaddingLayer(self, kernel_size):
return tf.keras.layers.Lambda(lambda x: ops.fixed_padding(x, kernel_size)) return tf.keras.layers.Lambda(lambda x: ops.fixed_padding(x, kernel_size))
...@@ -114,6 +116,9 @@ class _LayersOverride(object): ...@@ -114,6 +116,9 @@ class _LayersOverride(object):
if self._conv_hyperparams: if self._conv_hyperparams:
kwargs = self._conv_hyperparams.params(**kwargs) kwargs = self._conv_hyperparams.params(**kwargs)
else:
kwargs['kernel_regularizer'] = self.regularizer
kwargs['kernel_initializer'] = self.initializer
kwargs['padding'] = 'same' kwargs['padding'] = 'same'
kernel_size = kwargs.get('kernel_size') kernel_size = kwargs.get('kernel_size')
...@@ -144,6 +149,8 @@ class _LayersOverride(object): ...@@ -144,6 +149,8 @@ class _LayersOverride(object):
""" """
if self._conv_hyperparams: if self._conv_hyperparams:
kwargs = self._conv_hyperparams.params(**kwargs) kwargs = self._conv_hyperparams.params(**kwargs)
else:
kwargs['depthwise_initializer'] = self.initializer
kwargs['padding'] = 'same' kwargs['padding'] = 'same'
kernel_size = kwargs.get('kernel_size') kernel_size = kwargs.get('kernel_size')
......
...@@ -31,11 +31,10 @@ slim = tf.contrib.slim ...@@ -31,11 +31,10 @@ slim = tf.contrib.slim
# A modified config of mobilenet v1 that makes it more detection friendly, # A modified config of mobilenet v1 that makes it more detection friendly,
def _create_modified_mobilenet_config(): def _create_modified_mobilenet_config():
conv_defs = copy.copy(mobilenet_v1.MOBILENETV1_CONV_DEFS) conv_defs = copy.deepcopy(mobilenet_v1.MOBILENETV1_CONV_DEFS)
conv_defs[-2] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=512) conv_defs[-2] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=512)
conv_defs[-1] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=256) conv_defs[-1] = mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=256)
return conv_defs return conv_defs
_CONV_DEFS = _create_modified_mobilenet_config()
class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
...@@ -98,6 +97,9 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -98,6 +97,9 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
self._fpn_min_level = fpn_min_level self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth self._additional_layer_depth = additional_layer_depth
self._conv_defs = None
if self._use_depthwise:
self._conv_defs = _create_modified_mobilenet_config()
def preprocess(self, resized_inputs): def preprocess(self, resized_inputs):
"""SSD preprocessing. """SSD preprocessing.
...@@ -141,7 +143,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -141,7 +143,7 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
final_endpoint='Conv2d_13_pointwise', final_endpoint='Conv2d_13_pointwise',
min_depth=self._min_depth, min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier, depth_multiplier=self._depth_multiplier,
conv_defs=_CONV_DEFS if self._use_depthwise else None, conv_defs=self._conv_defs,
use_explicit_padding=self._use_explicit_padding, use_explicit_padding=self._use_explicit_padding,
scope=scope) scope=scope)
...@@ -159,7 +161,8 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -159,7 +161,8 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
fpn_features = feature_map_generators.fpn_top_down_feature_maps( fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list], [(key, image_features[key]) for key in feature_block_list],
depth=depth_fn(self._additional_layer_depth), depth=depth_fn(self._additional_layer_depth),
use_depthwise=self._use_depthwise) use_depthwise=self._use_depthwise,
use_explicit_padding=self._use_explicit_padding)
feature_maps = [] feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1): for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append(fpn_features['top_down_{}'.format( feature_maps.append(fpn_features['top_down_{}'.format(
...@@ -167,18 +170,23 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -167,18 +170,23 @@ class SSDMobileNetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
last_feature_map = fpn_features['top_down_{}'.format( last_feature_map = fpn_features['top_down_{}'.format(
feature_blocks[base_fpn_max_level - 2])] feature_blocks[base_fpn_max_level - 2])]
# Construct coarse features # Construct coarse features
padding = 'VALID' if self._use_explicit_padding else 'SAME'
kernel_size = 3
for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1): for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
if self._use_depthwise: if self._use_depthwise:
conv_op = functools.partial( conv_op = functools.partial(
slim.separable_conv2d, depth_multiplier=1) slim.separable_conv2d, depth_multiplier=1)
else: else:
conv_op = slim.conv2d conv_op = slim.conv2d
if self._use_explicit_padding:
last_feature_map = ops.fixed_padding(
last_feature_map, kernel_size)
last_feature_map = conv_op( last_feature_map = conv_op(
last_feature_map, last_feature_map,
num_outputs=depth_fn(self._additional_layer_depth), num_outputs=depth_fn(self._additional_layer_depth),
kernel_size=[3, 3], kernel_size=[kernel_size, kernel_size],
stride=2, stride=2,
padding='SAME', padding=padding,
scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13)) scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13))
feature_maps.append(last_feature_map) feature_maps.append(last_feature_map)
return feature_maps return feature_maps
...@@ -30,17 +30,14 @@ from nets.mobilenet import mobilenet_v2 ...@@ -30,17 +30,14 @@ from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim slim = tf.contrib.slim
# A modified config of mobilenet v2 that makes it more detection friendly, # A modified config of mobilenet v2 that makes it more detection friendly.
def _create_modified_mobilenet_config(): def _create_modified_mobilenet_config():
conv_defs = copy.copy(mobilenet_v2.V2_DEF) conv_defs = copy.deepcopy(mobilenet_v2.V2_DEF)
conv_defs['spec'][-1] = mobilenet.op( conv_defs['spec'][-1] = mobilenet.op(
slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=256) slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=256)
return conv_defs return conv_defs
_CONV_DEFS = _create_modified_mobilenet_config()
class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""SSD Feature Extractor using MobilenetV2 FPN features.""" """SSD Feature Extractor using MobilenetV2 FPN features."""
...@@ -100,6 +97,9 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -100,6 +97,9 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
self._fpn_min_level = fpn_min_level self._fpn_min_level = fpn_min_level
self._fpn_max_level = fpn_max_level self._fpn_max_level = fpn_max_level
self._additional_layer_depth = additional_layer_depth self._additional_layer_depth = additional_layer_depth
self._conv_defs = None
if self._use_depthwise:
self._conv_defs = _create_modified_mobilenet_config()
def preprocess(self, resized_inputs): def preprocess(self, resized_inputs):
"""SSD preprocessing. """SSD preprocessing.
...@@ -142,7 +142,7 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -142,7 +142,7 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple), ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
final_endpoint='layer_19', final_endpoint='layer_19',
depth_multiplier=self._depth_multiplier, depth_multiplier=self._depth_multiplier,
conv_defs=_CONV_DEFS if self._use_depthwise else None, conv_defs=self._conv_defs,
use_explicit_padding=self._use_explicit_padding, use_explicit_padding=self._use_explicit_padding,
scope=scope) scope=scope)
depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth) depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
...@@ -158,7 +158,8 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -158,7 +158,8 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
fpn_features = feature_map_generators.fpn_top_down_feature_maps( fpn_features = feature_map_generators.fpn_top_down_feature_maps(
[(key, image_features[key]) for key in feature_block_list], [(key, image_features[key]) for key in feature_block_list],
depth=depth_fn(self._additional_layer_depth), depth=depth_fn(self._additional_layer_depth),
use_depthwise=self._use_depthwise) use_depthwise=self._use_depthwise,
use_explicit_padding=self._use_explicit_padding)
feature_maps = [] feature_maps = []
for level in range(self._fpn_min_level, base_fpn_max_level + 1): for level in range(self._fpn_min_level, base_fpn_max_level + 1):
feature_maps.append(fpn_features['top_down_{}'.format( feature_maps.append(fpn_features['top_down_{}'.format(
...@@ -166,18 +167,23 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): ...@@ -166,18 +167,23 @@ class SSDMobileNetV2FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
last_feature_map = fpn_features['top_down_{}'.format( last_feature_map = fpn_features['top_down_{}'.format(
feature_blocks[base_fpn_max_level - 2])] feature_blocks[base_fpn_max_level - 2])]
# Construct coarse features # Construct coarse features
padding = 'VALID' if self._use_explicit_padding else 'SAME'
kernel_size = 3
for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1): for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
if self._use_depthwise: if self._use_depthwise:
conv_op = functools.partial( conv_op = functools.partial(
slim.separable_conv2d, depth_multiplier=1) slim.separable_conv2d, depth_multiplier=1)
else: else:
conv_op = slim.conv2d conv_op = slim.conv2d
if self._use_explicit_padding:
last_feature_map = ops.fixed_padding(
last_feature_map, kernel_size)
last_feature_map = conv_op( last_feature_map = conv_op(
last_feature_map, last_feature_map,
num_outputs=depth_fn(self._additional_layer_depth), num_outputs=depth_fn(self._additional_layer_depth),
kernel_size=[3, 3], kernel_size=[kernel_size, kernel_size],
stride=2, stride=2,
padding='SAME', padding=padding,
scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19)) scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
feature_maps.append(last_feature_map) feature_maps.append(last_feature_map)
return feature_maps return feature_maps
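The copy.copy to copy.deepcopy change matters most for MobileNet V2, whose V2_DEF holds a nested 'spec' list: a shallow copy shares that list, so editing its last entry would silently mutate the shared module-level definition. A generic illustration (plain dict/list, not the actual MobileNet defs):

import copy

SHARED_DEF = {'spec': ['op_a', 'op_b', 'op_c']}

shallow = copy.copy(SHARED_DEF)
shallow['spec'][-1] = 'modified'   # the shared nested list is mutated too
print(SHARED_DEF['spec'][-1])      # 'modified' -- unintended side effect

SHARED_DEF['spec'][-1] = 'op_c'    # reset for the comparison
deep = copy.deepcopy(SHARED_DEF)
deep['spec'][-1] = 'modified'      # only the deep copy changes
print(SHARED_DEF['spec'][-1])      # 'op_c'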