Commit 3b158095 authored by Ilya Mironov

Merge branch 'master' of https://github.com/ilyamironov/models

parents a90db800 be659c2f
......@@ -23,6 +23,7 @@ Localization losses:
Classification losses:
* WeightedSigmoidClassificationLoss
* WeightedSoftmaxClassificationLoss
* WeightedSoftmaxClassificationAgainstLogitsLoss
* BootstrappedSigmoidClassificationLoss
"""
from abc import ABCMeta
......@@ -317,6 +318,54 @@ class WeightedSoftmaxClassificationLoss(Loss):
return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
class WeightedSoftmaxClassificationAgainstLogitsLoss(Loss):
"""Softmax loss function against logits.
Targets are expected to be provided in logits space instead of "one hot" or
"probability distribution" space.
"""
def __init__(self, logit_scale=1.0):
"""Constructor.
Args:
logit_scale: Divisor applied to both target and predicted logits before
  the softmax. When this value is high, the target is "diffused"; when
  it is low, the target is made peakier. (default 1.0)
"""
self._logit_scale = logit_scale
def _scale_and_softmax_logits(self, logits):
"""Scale logits then apply softmax."""
scaled_logits = tf.divide(logits, self._logit_scale, name='scale_logits')
return tf.nn.softmax(scaled_logits, name='convert_scores')
def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function.
Args:
prediction_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing the predicted logits for each class
target_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing logit classification targets
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a float tensor of shape [batch_size, num_anchors]
representing the value of the loss function.
"""
num_classes = prediction_tensor.get_shape().as_list()[-1]
target_tensor = self._scale_and_softmax_logits(target_tensor)
prediction_tensor = tf.divide(prediction_tensor, self._logit_scale,
name='scale_logits')
per_row_cross_ent = (tf.nn.softmax_cross_entropy_with_logits(
labels=tf.reshape(target_tensor, [-1, num_classes]),
logits=tf.reshape(prediction_tensor, [-1, num_classes])))
return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
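To make the logit-space targets concrete, here is a minimal numpy sketch (illustrative only, not part of this commit) of how `logit_scale` sharpens or diffuses a target before the cross entropy is taken:

```python
import numpy as np

def softmax(logits):
    # Numerically stable softmax over a 1-D array.
    e = np.exp(logits - logits.max())
    return e / e.sum()

target_logits = np.array([0., 0., -100.])  # two equally likely classes
for scale in (1.0, 100.0):
    print(scale, np.round(softmax(target_logits / scale), 3))
# 1.0   [0.5   0.5   0.   ]  -> peaky target
# 100.0 [0.422 0.422 0.155]  -> diffused target
```

`_compute_loss` applies the same divisor to the prediction logits, so predictions and targets are compared on the same scale.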
class BootstrappedSigmoidClassificationLoss(Loss):
"""Bootstrapped sigmoid cross entropy classification loss function.
......
......@@ -576,6 +576,111 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
self.assertAllClose(loss_output, exp_loss)
class WeightedSoftmaxClassificationAgainstLogitsLossTest(tf.test.TestCase):
def testReturnsCorrectLoss(self):
prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
target_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[100, -100, -100],
[-100, -100, 100]],
[[-100, -100, 100],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 1]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationAgainstLogitsLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = - 1.5 * math.log(.5)
with self.test_session() as sess:
loss_output = sess.run(loss)
self.assertAllClose(loss_output, exp_loss)
def testReturnsCorrectAnchorWiseLoss(self):
prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
target_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[100, -100, -100],
[-100, -100, 100]],
[[-100, -100, 100],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationAgainstLogitsLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
exp_loss = np.matrix([[0, 0, - 0.5 * math.log(.5), 0],
[-math.log(.5), 0, 0, 0]])
with self.test_session() as sess:
loss_output = sess.run(loss)
self.assertAllClose(loss_output, exp_loss)
def testReturnsCorrectAnchorWiseLossWithLogitScaleSetting(self):
logit_scale = 100.
prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
target_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
[0, 0, -100],
[-100, -100, 100]],
[[-100, 0, 0],
[-100, 100, -100],
[-100, 100, -100],
[100, -100, -100]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationAgainstLogitsLoss(
logit_scale=logit_scale)
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
# find softmax of the two prediction types above
softmax_pred1 = [np.exp(-1), np.exp(-1), np.exp(1)]
softmax_pred1 /= sum(softmax_pred1)
softmax_pred2 = [np.exp(0), np.exp(0), np.exp(-1)]
softmax_pred2 /= sum(softmax_pred2)
# compute the expected cross entropy for perfect matches
exp_entropy1 = sum(
[-x*np.log(x) for x in softmax_pred1])
exp_entropy2 = sum(
[-x*np.log(x) for x in softmax_pred2])
# weighted expected losses
exp_loss = np.matrix(
[[exp_entropy1, exp_entropy1, exp_entropy2*.5, exp_entropy1],
[exp_entropy2, exp_entropy1, exp_entropy1, 0.]])
with self.test_session() as sess:
loss_output = sess.run(loss)
self.assertAllClose(loss_output, exp_loss)
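As a standalone numeric check of the expected values in this test (plain numpy, outside the TF graph), the two entropies evaluate to roughly 0.666 and 1.017:

```python
import numpy as np

def entropy_of_softmax(logits):
    p = np.exp(logits) / np.sum(np.exp(logits))
    return float(-np.sum(p * np.log(p)))

print(entropy_of_softmax(np.array([-1., -1., 1.])))  # ~0.6656, exp_entropy1
print(entropy_of_softmax(np.array([0., 0., -1.])))   # ~1.0174, exp_entropy2
```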
class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
def testReturnsCorrectLossSoftBootstrapping(self):
......
......@@ -1000,8 +1000,8 @@ def random_adjust_saturation(image,
def random_distort_color(image, color_ordering=0, preprocess_vars_cache=None):
"""Randomly distorts color.
Randomly distorts color using a combination of brightness, hue, contrast
and saturation changes. Makes sure the output image is still between 0 and 255.
Randomly distorts color using a combination of brightness, hue, contrast and
saturation changes. Makes sure the output image is still between 0 and 255.
Args:
image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
......
......@@ -2620,16 +2620,24 @@ class PreprocessorTest(tf.test.TestCase):
distorted_images_rank = tf.rank(distorted_images)
boxes_rank = tf.rank(boxes)
distorted_boxes_rank = tf.rank(distorted_boxes)
multiclass_scores_rank = tf.rank(multiclass_scores)
distorted_multiclass_scores_rank = tf.rank(distorted_multiclass_scores)
with self.test_session() as sess:
(boxes_rank_, distorted_boxes_rank_, images_rank_, distorted_images_rank_,
multiclass_scores_, distorted_multiclass_scores_) = sess.run([
boxes_rank, distorted_boxes_rank, images_rank, distorted_images_rank,
multiclass_scores, distorted_multiclass_scores
(boxes_rank_, distorted_boxes_, distorted_boxes_rank_, images_rank_,
distorted_images_rank_, multiclass_scores_rank_,
distorted_multiclass_scores_,
distorted_multiclass_scores_rank_) = sess.run([
boxes_rank, distorted_boxes, distorted_boxes_rank, images_rank,
distorted_images_rank, multiclass_scores_rank,
distorted_multiclass_scores, distorted_multiclass_scores_rank
])
self.assertAllEqual(boxes_rank_, distorted_boxes_rank_)
self.assertAllEqual(images_rank_, distorted_images_rank_)
self.assertAllEqual(multiclass_scores_, distorted_multiclass_scores_)
self.assertAllEqual(multiclass_scores_rank_,
distorted_multiclass_scores_rank_)
self.assertAllEqual(distorted_boxes_.shape[0],
distorted_multiclass_scores_.shape[0])
def testSSDRandomCropPad(self):
images = self.createTestImages()
......
item {
name: "bend/bow (at the waist)"
id: 1
}
item {
name: "crouch/kneel"
id: 3
}
item {
name: "dance"
id: 4
}
item {
name: "fall down"
id: 5
}
item {
name: "get up"
id: 6
}
item {
name: "jump/leap"
id: 7
}
item {
name: "lie/sleep"
id: 8
}
item {
name: "martial art"
id: 9
}
item {
name: "run/jog"
id: 10
}
item {
name: "sit"
id: 11
}
item {
name: "stand"
id: 12
}
item {
name: "swim"
id: 13
}
item {
name: "walk"
id: 14
}
item {
name: "answer phone"
id: 15
}
item {
name: "carry/hold (an object)"
id: 17
}
item {
name: "climb (e.g., a mountain)"
id: 20
}
item {
name: "close (e.g., a door, a box)"
id: 22
}
item {
name: "cut"
id: 24
}
item {
name: "dress/put on clothing"
id: 26
}
item {
name: "drink"
id: 27
}
item {
name: "drive (e.g., a car, a truck)"
id: 28
}
item {
name: "eat"
id: 29
}
item {
name: "enter"
id: 30
}
item {
name: "hit (an object)"
id: 34
}
item {
name: "lift/pick up"
id: 36
}
item {
name: "listen (e.g., to music)"
id: 37
}
item {
name: "open (e.g., a window, a car door)"
id: 38
}
item {
name: "play musical instrument"
id: 41
}
item {
name: "point to (an object)"
id: 43
}
item {
name: "pull (an object)"
id: 45
}
item {
name: "push (an object)"
id: 46
}
item {
name: "put down"
id: 47
}
item {
name: "read"
id: 48
}
item {
name: "ride (e.g., a bike, a car, a horse)"
id: 49
}
item {
name: "sail boat"
id: 51
}
item {
name: "shoot"
id: 52
}
item {
name: "smoke"
id: 54
}
item {
name: "take a photo"
id: 56
}
item {
name: "text on/look at a cellphone"
id: 57
}
item {
name: "throw"
id: 58
}
item {
name: "touch (an object)"
id: 59
}
item {
name: "turn (e.g., a screwdriver)"
id: 60
}
item {
name: "watch (e.g., TV)"
id: 61
}
item {
name: "work on a computer"
id: 62
}
item {
name: "write"
id: 63
}
item {
name: "fight/hit (a person)"
id: 64
}
item {
name: "give/serve (an object) to (a person)"
id: 65
}
item {
name: "grab (a person)"
id: 66
}
item {
name: "hand clap"
id: 67
}
item {
name: "hand shake"
id: 68
}
item {
name: "hand wave"
id: 69
}
item {
name: "hug (a person)"
id: 70
}
item {
name: "kiss (a person)"
id: 72
}
item {
name: "lift (a person)"
id: 73
}
item {
name: "listen to (a person)"
id: 74
}
item {
name: "push (another person)"
id: 76
}
item {
name: "sing to (e.g., self, a person, a group)"
id: 77
}
item {
name: "take (an object) from (a person)"
id: 78
}
item {
name: "talk to (e.g., self, a person, a group)"
id: 79
}
item {
name: "watch (a person)"
id: 80
}
......@@ -111,7 +111,8 @@ class TfExampleDecoder(data_decoder.DataDecoder):
instance_mask_type=input_reader_pb2.NUMERICAL_MASKS,
label_map_proto_file=None,
use_display_name=False,
dct_method=''):
dct_method='',
num_keypoints=0):
"""Constructor sets keys_to_features and items_to_handlers.
Args:
......@@ -131,6 +132,7 @@ class TfExampleDecoder(data_decoder.DataDecoder):
algorithm used for jpeg decompression. Currently valid values
are ['INTEGER_FAST', 'INTEGER_ACCURATE']. The hint may be ignored if,
for example, the jpeg library does not have that specific option.
num_keypoints: the number of keypoints per object.
Raises:
ValueError: If `instance_mask_type` option is not one of
......@@ -149,9 +151,9 @@ class TfExampleDecoder(data_decoder.DataDecoder):
'image/source_id':
tf.FixedLenFeature((), tf.string, default_value=''),
'image/height':
tf.FixedLenFeature((), tf.int64, 1),
tf.FixedLenFeature((), tf.int64, default_value=1),
'image/width':
tf.FixedLenFeature((), tf.int64, 1),
tf.FixedLenFeature((), tf.int64, default_value=1),
# Object boxes and classes.
'image/object/bbox/xmin':
tf.VarLenFeature(tf.float32),
......@@ -209,6 +211,16 @@ class TfExampleDecoder(data_decoder.DataDecoder):
fields.InputDataFields.groundtruth_weights: (
slim_example_decoder.Tensor('image/object/weight')),
}
self._num_keypoints = num_keypoints
if num_keypoints > 0:
self.keys_to_features['image/object/keypoint/x'] = (
tf.VarLenFeature(tf.float32))
self.keys_to_features['image/object/keypoint/y'] = (
tf.VarLenFeature(tf.float32))
self.items_to_handlers[fields.InputDataFields.groundtruth_keypoints] = (
slim_example_decoder.ItemHandlerCallback(
['image/object/keypoint/y', 'image/object/keypoint/x'],
self._reshape_keypoints))
if load_instance_masks:
if instance_mask_type in (input_reader_pb2.DEFAULT,
input_reader_pb2.NUMERICAL_MASKS):
......@@ -286,6 +298,9 @@ class TfExampleDecoder(data_decoder.DataDecoder):
[None] indicating if the boxes represent `difficult` instances.
fields.InputDataFields.groundtruth_group_of - 1D bool tensor of shape
[None] indicating if the boxes represent `group_of` instances.
fields.InputDataFields.groundtruth_keypoints - 3D float32 tensor of
shape [None, None, 2] containing keypoints, where the coordinates of
the keypoints are ordered (y, x).
fields.InputDataFields.groundtruth_instance_masks - 3D float32 tensor of
shape [None, None, None] containing instance masks.
"""
......@@ -314,6 +329,31 @@ class TfExampleDecoder(data_decoder.DataDecoder):
default_groundtruth_weights)
return tensor_dict
def _reshape_keypoints(self, keys_to_tensors):
"""Reshape keypoints.
The flattened keypoint coordinates are reshaped to [num_instances,
num_keypoints, 2].
Args:
keys_to_tensors: a dictionary from keys to tensors.
Returns:
A 3-D float tensor of shape [num_instances, num_keypoints, 2] containing
the keypoint coordinates, ordered (y, x).
"""
y = keys_to_tensors['image/object/keypoint/y']
if isinstance(y, tf.SparseTensor):
y = tf.sparse_tensor_to_dense(y)
y = tf.expand_dims(y, 1)
x = keys_to_tensors['image/object/keypoint/x']
if isinstance(x, tf.SparseTensor):
x = tf.sparse_tensor_to_dense(x)
x = tf.expand_dims(x, 1)
keypoints = tf.concat([y, x], 1)
keypoints = tf.reshape(keypoints, [-1, self._num_keypoints, 2])
return keypoints
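A small numpy illustration (shapes taken from the keypoint test below, values invented) of the reshape this handler performs for 2 objects with 3 keypoints each:

```python
import numpy as np

ys = np.array([0., 1., 2., 3., 4., 5.])  # image/object/keypoint/y, flattened
xs = np.array([1., 2., 3., 4., 5., 6.])  # image/object/keypoint/x, flattened
num_keypoints = 3

# Equivalent to the expand_dims + concat + reshape above.
keypoints = np.stack([ys, xs], axis=1).reshape(-1, num_keypoints, 2)
print(keypoints.shape)  # (2, 3, 2)
print(keypoints[0, 1])  # [1. 2.], i.e. (y, x) of object 0, keypoint 1
```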
def _reshape_instance_masks(self, keys_to_tensors):
"""Reshape instance segmentation masks.
......
......@@ -304,6 +304,50 @@ class TfExampleDecoderTest(tf.test.TestCase):
self.assertAllEqual(
2, tensor_dict[fields.InputDataFields.num_groundtruth_boxes])
def testDecodeKeypoint(self):
image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
encoded_jpeg = self._EncodeImage(image_tensor)
bbox_ymins = [0.0, 4.0]
bbox_xmins = [1.0, 5.0]
bbox_ymaxs = [2.0, 6.0]
bbox_xmaxs = [3.0, 7.0]
keypoint_ys = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
keypoint_xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
example = tf.train.Example(features=tf.train.Features(feature={
'image/encoded': self._BytesFeature(encoded_jpeg),
'image/format': self._BytesFeature('jpeg'),
'image/object/bbox/ymin': self._FloatFeature(bbox_ymins),
'image/object/bbox/xmin': self._FloatFeature(bbox_xmins),
'image/object/bbox/ymax': self._FloatFeature(bbox_ymaxs),
'image/object/bbox/xmax': self._FloatFeature(bbox_xmaxs),
'image/object/keypoint/y': self._FloatFeature(keypoint_ys),
'image/object/keypoint/x': self._FloatFeature(keypoint_xs),
})).SerializeToString()
example_decoder = tf_example_decoder.TfExampleDecoder(num_keypoints=3)
tensor_dict = example_decoder.decode(tf.convert_to_tensor(example))
self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_boxes].
get_shape().as_list()), [None, 4])
self.assertAllEqual((tensor_dict[fields.InputDataFields.
groundtruth_keypoints].
get_shape().as_list()), [None, 3, 2])
with self.test_session() as sess:
tensor_dict = sess.run(tensor_dict)
expected_boxes = np.vstack([bbox_ymins, bbox_xmins,
bbox_ymaxs, bbox_xmaxs]).transpose()
self.assertAllEqual(expected_boxes,
tensor_dict[fields.InputDataFields.groundtruth_boxes])
self.assertAllEqual(
2, tensor_dict[fields.InputDataFields.num_groundtruth_boxes])
expected_keypoints = (
np.vstack([keypoint_ys, keypoint_xs]).transpose().reshape((2, 3, 2)))
self.assertAllEqual(expected_keypoints,
tensor_dict[
fields.InputDataFields.groundtruth_keypoints])
def testDecodeDefaultGroundtruthWeights(self):
image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
encoded_jpeg = self._EncodeImage(image_tensor)
......
......@@ -91,7 +91,7 @@ Some remarks on frozen inference graphs:
## Kitti-trained models {#kitti-models}
Model name | Speed (ms) | Pascal mAP@0.5 (ms) | Outputs
Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
[faster_rcnn_resnet101_kitti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_kitti_2018_01_28.tar.gz) | 79 | 87 | Boxes
......@@ -103,6 +103,13 @@ Model name
[faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid_2018_01_28.tar.gz) | 347 | | Boxes
## AVA v2.1 trained models {#ava-models}
Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
[faster_rcnn_resnet101_ava_v2.1](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_ava_v2.1_2018_04_30.tar.gz) | 93 | 11 | Boxes
[^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval).
[^2]: This is PASCAL mAP with a slightly different way of true positives computation: see [Open Images evaluation protocol](evaluation_protocols.md#open-images).
......@@ -4,7 +4,7 @@
Tensorflow Object Detection API depends on the following libraries:
* Protobuf 2.6
* Protobuf 3+
* Python-tk
* Pillow 1.0
* lxml
......
......@@ -599,9 +599,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
(and if number_of_stages > 1):
7) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using
a shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
8) class_predictions_with_background: a 3-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
......@@ -712,9 +714,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
Returns:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
......@@ -791,9 +795,11 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
prediction_dict: a dictionary holding "raw" prediction tensors:
1) refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, 4] representing predicted
(final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals
[total_num_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings, where
total_num_proposals=batch_size*self._max_num_proposals. If using a
shared box across classes the shape will instead be
[total_num_proposals, 1, self._box_coder.code_size].
2) class_predictions_with_background: a 3-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors, where
......@@ -823,13 +829,13 @@ class FasterRCNNMetaArch(model.DetectionModel):
if self._is_training:
curr_box_classifier_features = prediction_dict['box_classifier_features']
detection_classes = prediction_dict['class_predictions_with_background']
box_predictions = self._mask_rcnn_box_predictor.predict(
mask_predictions = self._mask_rcnn_box_predictor.predict(
[curr_box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
predict_boxes_and_classes=False,
predict_auxiliary_outputs=True)
prediction_dict['mask_predictions'] = tf.squeeze(box_predictions[
prediction_dict['mask_predictions'] = tf.squeeze(mask_predictions[
box_predictor.MASK_PREDICTIONS], axis=1)
else:
detections_dict = self._postprocess_box_classifier(
......@@ -854,14 +860,14 @@ class FasterRCNNMetaArch(model.DetectionModel):
flattened_detected_feature_maps,
scope=self.second_stage_feature_extractor_scope))
box_predictions = self._mask_rcnn_box_predictor.predict(
mask_predictions = self._mask_rcnn_box_predictor.predict(
[curr_box_classifier_features],
num_predictions_per_location=[1],
scope=self.second_stage_box_predictor_scope,
predict_boxes_and_classes=False,
predict_auxiliary_outputs=True)
detection_masks = tf.squeeze(box_predictions[
detection_masks = tf.squeeze(mask_predictions[
box_predictor.MASK_PREDICTIONS], axis=1)
_, num_classes, mask_height, mask_width = (
......@@ -1098,8 +1104,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
tf.to_float(num_proposals),
}
# TODO(jrru): Remove mask_predictions from _post_process_box_classifier.
with tf.name_scope('SecondStagePostprocessor'):
if self._number_of_stages == 2:
if (self._number_of_stages == 2 or
(self._number_of_stages == 3 and self._is_training)):
mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
detections_dict = self._postprocess_box_classifier(
prediction_dict['refined_box_encodings'],
......@@ -1438,8 +1446,10 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
refined_box_encodings: a 3-D float tensor with shape
[total_num_padded_proposals, num_classes, 4] representing predicted
(final) refined box encodings.
[total_num_padded_proposals, num_classes, self._box_coder.code_size]
representing predicted (final) refined box encodings. If using a shared
box across classes the shape will instead be
[total_num_padded_proposals, 1, self._box_coder.code_size].
class_predictions_with_background: a 3-D tensor float with shape
[total_num_padded_proposals, num_classes + 1] containing class
predictions (logits) for each of the proposals. Note that this tensor
......@@ -1466,10 +1476,12 @@ class FasterRCNNMetaArch(model.DetectionModel):
that a pixel-wise sigmoid score converter is applied to the detection
masks.
"""
refined_box_encodings_batch = tf.reshape(refined_box_encodings,
[-1, self.max_num_proposals,
self.num_classes,
self._box_coder.code_size])
refined_box_encodings_batch = tf.reshape(
refined_box_encodings,
[-1,
self.max_num_proposals,
refined_box_encodings.shape[1],
self._box_coder.code_size])
class_predictions_with_background_batch = tf.reshape(
class_predictions_with_background,
[-1, self.max_num_proposals, self.num_classes + 1]
......@@ -1517,13 +1529,18 @@ class FasterRCNNMetaArch(model.DetectionModel):
box_encodings: a 4-D tensor with shape
[batch_size, num_anchors, num_classes, self._box_coder.code_size]
representing box encodings. If using a shared box across classes the
shape will instead be
[batch_size, num_anchors, 1, self._box_coder.code_size].
anchor_boxes: [batch_size, num_anchors, 4] representing
decoded bounding boxes.
anchor_boxes: [batch_size, num_anchors, self._box_coder.code_size]
representing decoded bounding boxes.
Returns:
decoded_boxes: a [batch_size, num_anchors, num_classes, 4]
float tensor representing bounding box predictions
(for each image in batch, proposal and class).
decoded_boxes: a
[batch_size, num_anchors, num_classes, self._box_coder.code_size]
float tensor representing bounding box predictions (for each image in
batch, proposal and class). If using a shared box across classes the
shape will instead be
[batch_size, num_anchors, 1, self._box_coder.code_size].
"""
combined_shape = shape_utils.combined_static_and_dynamic_shape(
box_encodings)
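For the shape bookkeeping in `_batch_decode_boxes`, a numpy sketch (illustrative, code_size=4 assumed) of tiling anchors across the class dimension so that each per-class encoding has a matching anchor; with a shared box the class dimension is 1 and nothing is duplicated:

```python
import numpy as np

batch_size, num_anchors, num_classes, code_size = 2, 3, 5, 4
box_encodings = np.zeros((batch_size, num_anchors, num_classes, code_size))
anchor_boxes = np.ones((batch_size, num_anchors, code_size))

# One copy of each anchor per class, aligning anchors 1:1 with encodings.
tiled_anchors = np.tile(anchor_boxes[:, :, np.newaxis, :],
                        (1, 1, num_classes, 1))
print(tiled_anchors.shape)  # (2, 3, 5, 4), same leading dims as box_encodings
```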
......@@ -1697,7 +1714,9 @@ class FasterRCNNMetaArch(model.DetectionModel):
Args:
refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, box_coder.code_size] representing
predicted (final) refined box encodings.
predicted (final) refined box encodings. If using a shared box across
classes this will instead have shape
[total_num_proposals, 1, box_coder.code_size].
class_predictions_with_background: a 2-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors. Note that this tensor
......@@ -1748,31 +1767,39 @@ class FasterRCNNMetaArch(model.DetectionModel):
self._detector_target_assigner, proposal_boxlists,
groundtruth_boxlists, groundtruth_classes_with_background_list)
# We only predict refined location encodings for the non background
# classes, but we now pad it to make it compatible with the class
# predictions
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
[batch_size, self.max_num_proposals, -1])
flat_cls_targets_with_background = tf.reshape(
batch_cls_targets_with_background,
[batch_size * self.max_num_proposals, -1])
refined_box_encodings_with_background = tf.pad(
refined_box_encodings, [[0, 0], [1, 0], [0, 0]])
# For anchors with multiple labels, picks refined_location_encodings
# for just one class to avoid over-counting for regression loss and
# (optionally) mask loss.
one_hot_flat_cls_targets_with_background = tf.argmax(
flat_cls_targets_with_background, axis=1)
one_hot_flat_cls_targets_with_background = tf.one_hot(
one_hot_flat_cls_targets_with_background,
flat_cls_targets_with_background.get_shape()[1])
refined_box_encodings_masked_by_class_targets = tf.boolean_mask(
refined_box_encodings_with_background,
tf.greater(one_hot_flat_cls_targets_with_background, 0))
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
[batch_size, self.max_num_proposals, -1])
reshaped_refined_box_encodings = tf.reshape(
refined_box_encodings_masked_by_class_targets,
[batch_size, -1, 4])
# If using a shared box across classes use directly
if refined_box_encodings.shape[1] == 1:
reshaped_refined_box_encodings = tf.reshape(
refined_box_encodings,
[batch_size, self.max_num_proposals, self._box_coder.code_size])
# For anchors with multiple labels, picks refined_location_encodings
# for just one class to avoid over-counting for regression loss and
# (optionally) mask loss.
else:
# We only predict refined location encodings for the non background
# classes, but we now pad it to make it compatible with the class
# predictions
refined_box_encodings_with_background = tf.pad(
refined_box_encodings, [[0, 0], [1, 0], [0, 0]])
refined_box_encodings_masked_by_class_targets = tf.boolean_mask(
refined_box_encodings_with_background,
tf.greater(one_hot_flat_cls_targets_with_background, 0))
reshaped_refined_box_encodings = tf.reshape(
refined_box_encodings_masked_by_class_targets,
[batch_size, self.max_num_proposals, self._box_coder.code_size])
second_stage_loc_losses = self._second_stage_localization_loss(
reshaped_refined_box_encodings,
......
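To make the non-shared branch above concrete, a numpy sketch (illustrative, not from the diff) of padding a background slot and picking exactly one refined encoding per proposal via the one-hot class targets:

```python
import numpy as np

num_proposals, num_classes, code_size = 2, 3, 4
refined = np.arange(num_proposals * num_classes * code_size,
                    dtype=np.float32).reshape(num_proposals, num_classes,
                                              code_size)

# Mirrors tf.pad(..., [[0, 0], [1, 0], [0, 0]]): prepend a zero
# "background" slot along the class axis.
refined_with_bg = np.pad(refined, [(0, 0), (1, 0), (0, 0)])

cls_targets = np.array([2, 0])  # argmax class per proposal; 0 is background
one_hot = np.eye(num_classes + 1)[cls_targets] > 0
picked = refined_with_bg[one_hot]  # analogous to tf.boolean_mask
print(picked.shape)  # (2, 4): one encoding per proposal
```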
......@@ -85,6 +85,46 @@ class FasterRCNNMetaArchTest(
self.assertTrue(np.amax(detections_out['detection_masks'] <= 1.0))
self.assertTrue(np.amin(detections_out['detection_masks'] >= 0.0))
def test_postprocess_second_stage_only_inference_mode_with_shared_boxes(self):
model = self._build_model(
is_training=False, number_of_stages=2, second_stage_batch_size=6)
batch_size = 2
total_num_padded_proposals = batch_size * model.max_num_proposals
proposal_boxes = tf.constant(
[[[1, 1, 2, 3],
[0, 0, 1, 1],
[.5, .5, .6, .6],
4*[0], 4*[0], 4*[0], 4*[0], 4*[0]],
[[2, 3, 6, 8],
[1, 2, 5, 3],
4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]], dtype=tf.float32)
num_proposals = tf.constant([3, 2], dtype=tf.int32)
# This has a single box shared across all classes instead of one per class.
refined_box_encodings = tf.zeros(
[total_num_padded_proposals, 1, 4], dtype=tf.float32)
class_predictions_with_background = tf.ones(
[total_num_padded_proposals, model.num_classes+1], dtype=tf.float32)
image_shape = tf.constant([batch_size, 36, 48, 3], dtype=tf.int32)
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
detections = model.postprocess({
'refined_box_encodings': refined_box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'num_proposals': num_proposals,
'proposal_boxes': proposal_boxes,
'image_shape': image_shape,
}, true_image_shapes)
with self.test_session() as sess:
detections_out = sess.run(detections)
self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4])
self.assertAllClose(detections_out['detection_scores'],
[[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])
self.assertAllClose(detections_out['detection_classes'],
[[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]])
self.assertAllClose(detections_out['num_detections'], [5, 4])
@parameterized.parameters(
{'masks_are_class_agnostic': False},
{'masks_are_class_agnostic': True},
......
......@@ -1284,6 +1284,106 @@ class FasterRCNNMetaArchTestBase(tf.test.TestCase):
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/classification_loss'], 0)
def test_loss_full_with_shared_boxes(self):
model = self._build_model(
is_training=True, number_of_stages=2, second_stage_batch_size=6)
batch_size = 2
anchors = tf.constant(
[[0, 0, 16, 16],
[0, 16, 16, 32],
[16, 0, 32, 16],
[16, 16, 32, 32]], dtype=tf.float32)
rpn_box_encodings = tf.zeros(
[batch_size,
anchors.get_shape().as_list()[0],
BOX_CODE_SIZE], dtype=tf.float32)
# Use different numbers for the objectness category to break ties in the
# order of boxes returned by NMS.
rpn_objectness_predictions_with_background = tf.constant([
[[-10, 13],
[10, -10],
[10, -11],
[-10, 12]],
[[10, -10],
[-10, 13],
[-10, 12],
[10, -11]]], dtype=tf.float32)
image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32)
num_proposals = tf.constant([6, 6], dtype=tf.int32)
proposal_boxes = tf.constant(
2 * [[[0, 0, 16, 16],
[0, 16, 16, 32],
[16, 0, 32, 16],
[16, 16, 32, 32],
[0, 0, 16, 16],
[0, 16, 16, 32]]], dtype=tf.float32)
refined_box_encodings = tf.zeros(
(batch_size * model.max_num_proposals,
1, # one box shared among all the classes
BOX_CODE_SIZE), dtype=tf.float32)
class_predictions_with_background = tf.constant(
[[-10, 10, -10], # first image
[10, -10, -10],
[10, -10, -10],
[-10, -10, 10],
[-10, 10, -10],
[10, -10, -10],
[10, -10, -10], # second image
[-10, 10, -10],
[-10, 10, -10],
[10, -10, -10],
[10, -10, -10],
[-10, 10, -10]], dtype=tf.float32)
mask_predictions_logits = 20 * tf.ones((batch_size *
model.max_num_proposals,
model.num_classes,
14, 14),
dtype=tf.float32)
groundtruth_boxes_list = [
tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32),
tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)]
groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32),
tf.constant([[1, 0], [1, 0]], dtype=tf.float32)]
# Set all elements of groundtruth mask to 1.0. In this case all proposal
# crops of the groundtruth masks should return a mask that covers the entire
# proposal. Thus, if the mask_predictions_logits elements are all at least
# 20, the loss should be (numerically) zero.
groundtruth_masks_list = [tf.convert_to_tensor(np.ones((2, 32, 32)),
dtype=tf.float32),
tf.convert_to_tensor(np.ones((2, 32, 32)),
dtype=tf.float32)]
prediction_dict = {
'rpn_box_encodings': rpn_box_encodings,
'rpn_objectness_predictions_with_background':
rpn_objectness_predictions_with_background,
'image_shape': image_shape,
'anchors': anchors,
'refined_box_encodings': refined_box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'proposal_boxes': proposal_boxes,
'num_proposals': num_proposals,
'mask_predictions': mask_predictions_logits
}
_, true_image_shapes = model.preprocess(tf.zeros(image_shape))
model.provide_groundtruth(groundtruth_boxes_list,
groundtruth_classes_list,
groundtruth_masks_list)
loss_dict = model.loss(prediction_dict, true_image_shapes)
with self.test_session() as sess:
loss_dict_out = sess.run(loss_dict)
self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0)
self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0)
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/localization_loss'], 0)
self.assertAllClose(loss_dict_out[
'Loss/BoxClassifierLoss/classification_loss'], 0)
self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0)
def test_restore_map_for_classification_ckpt(self):
# Define mock tensorflow classification graph and save variables.
test_graph_classification = tf.Graph()
......
......@@ -203,26 +203,39 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes,
groundtruth_classes, detection_boxes,
detection_scores, detection_classes):
detection_scores, detection_classes,
num_gt_boxes_per_image=None,
num_det_boxes_per_image=None):
"""Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`.
Note that once value_op is called, the detections and groundtruth added via
update_op are cleared.
This function can take in groundtruth and detections for a batch of images,
or for a single image. For the latter case, the batch dimension for input
tensors need not be present.
Args:
image_id: Unique string/integer identifier for the image.
groundtruth_boxes: float32 tensor of shape [num_boxes, 4] containing
`num_boxes` groundtruth boxes of the format
image_id: string/integer tensor of shape [batch] with unique identifiers
for the images.
groundtruth_boxes: float32 tensor of shape [batch, num_boxes, 4]
containing `num_boxes` groundtruth boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
groundtruth_classes: int32 tensor of shape [num_boxes] containing
groundtruth_classes: int32 tensor of shape [batch, num_boxes] containing
1-indexed groundtruth classes for the boxes.
detection_boxes: float32 tensor of shape [num_boxes, 4] containing
detection_boxes: float32 tensor of shape [batch, num_boxes, 4] containing
`num_boxes` detection boxes of the format [ymin, xmin, ymax, xmax]
in absolute image coordinates.
detection_scores: float32 tensor of shape [num_boxes] containing
detection_scores: float32 tensor of shape [batch, num_boxes] containing
detection scores for the boxes.
detection_classes: int32 tensor of shape [num_boxes] containing
detection_classes: int32 tensor of shape [batch, num_boxes] containing
1-indexed detection classes for the boxes.
num_gt_boxes_per_image: int32 tensor of shape [batch] containing the
number of groundtruth boxes per image. If None, will assume no padding
in groundtruth tensors.
num_det_boxes_per_image: int32 tensor of shape [batch] containing the
number of detection boxes per image. If None, will assume no padding in
the detection tensors.
Returns:
a dictionary of metric names to tuple of value_op and update_op that can
......@@ -231,28 +244,68 @@ class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator):
guarantee correct behaviour.
"""
def update_op(
image_id,
groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes):
self.add_single_ground_truth_image_info(
image_id,
{'groundtruth_boxes': groundtruth_boxes,
'groundtruth_classes': groundtruth_classes})
self.add_single_detected_image_info(
image_id,
{'detection_boxes': detection_boxes,
'detection_scores': detection_scores,
'detection_classes': detection_classes})
image_id_batched,
groundtruth_boxes_batched,
groundtruth_classes_batched,
num_gt_boxes_per_image,
detection_boxes_batched,
detection_scores_batched,
detection_classes_batched,
num_det_boxes_per_image):
"""Update operation for adding batch of images to Coco evaluator."""
for (image_id, gt_box, gt_class, num_gt_box, det_box, det_score,
det_class, num_det_box) in zip(
image_id_batched, groundtruth_boxes_batched,
groundtruth_classes_batched, num_gt_boxes_per_image,
detection_boxes_batched, detection_scores_batched,
detection_classes_batched, num_det_boxes_per_image):
self.add_single_ground_truth_image_info(
image_id,
{'groundtruth_boxes': gt_box[:num_gt_box],
'groundtruth_classes': gt_class[:num_gt_box]})
self.add_single_detected_image_info(
image_id,
{'detection_boxes': det_box[:num_det_box],
'detection_scores': det_score[:num_det_box],
'detection_classes': det_class[:num_det_box]})
if not image_id.shape.as_list():
# Apply a batch dimension to all tensors.
image_id = tf.expand_dims(image_id, 0)
groundtruth_boxes = tf.expand_dims(groundtruth_boxes, 0)
groundtruth_classes = tf.expand_dims(groundtruth_classes, 0)
detection_boxes = tf.expand_dims(detection_boxes, 0)
detection_scores = tf.expand_dims(detection_scores, 0)
detection_classes = tf.expand_dims(detection_classes, 0)
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.shape(groundtruth_boxes)[1:2]
else:
num_gt_boxes_per_image = tf.expand_dims(num_gt_boxes_per_image, 0)
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.shape(detection_boxes)[1:2]
else:
num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0)
else:
if num_gt_boxes_per_image is None:
num_gt_boxes_per_image = tf.tile(
tf.shape(groundtruth_boxes)[1:2],
multiples=tf.shape(groundtruth_boxes)[0:1])
if num_det_boxes_per_image is None:
num_det_boxes_per_image = tf.tile(
tf.shape(detection_boxes)[1:2],
multiples=tf.shape(detection_boxes)[0:1])
update_op = tf.py_func(update_op, [image_id,
groundtruth_boxes,
groundtruth_classes,
num_gt_boxes_per_image,
detection_boxes,
detection_scores,
detection_classes], [])
detection_classes,
num_det_boxes_per_image], [])
metric_names = ['DetectionBoxes_Precision/mAP',
'DetectionBoxes_Precision/mAP@.50IOU',
'DetectionBoxes_Precision/mAP@.75IOU',
......@@ -583,5 +636,3 @@ class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator):
eval_metric_ops[metric_name] = (tf.py_func(
value_func_factory(metric_name), [], np.float32), update_op)
return eval_metric_ops
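The padding contract introduced above boils down to per-image slicing; a numpy sketch (illustrative, data invented) of how `num_gt_boxes_per_image` strips padding rows before they reach the COCO evaluator:

```python
import numpy as np

gt_boxes = np.array([[[100., 100., 200., 200.],
                      [-1., -1., -1., -1.]],   # padding row
                     [[25., 25., 50., 50.],
                      [10., 10., 15., 15.]]])  # two real boxes, no padding
num_gt_boxes_per_image = np.array([1, 2])

for boxes, num_valid in zip(gt_boxes, num_gt_boxes_per_image):
    print(boxes[:num_valid])  # only valid rows are kept per image
```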
......@@ -317,6 +317,230 @@ class CocoEvaluationPyFuncTest(tf.test.TestCase):
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self):
category_list = [{
'id': 0,
'name': 'person'
}, {
'id': 1,
'name': 'cat'
}, {
'id': 2,
'name': 'dog'
}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
image_id = tf.placeholder(tf.string, shape=())
groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(None,))
detection_boxes = tf.placeholder(tf.float32, shape=(None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(None,))
detection_classes = tf.placeholder(tf.float32, shape=(None,))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes, groundtruth_classes, detection_boxes,
detection_scores, detection_classes)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(
update_op,
feed_dict={
image_id:
'image1',
groundtruth_boxes:
np.array([[100., 100., 200., 200.], [-1, -1, -1, -1]]),
groundtruth_classes:
np.array([1, -1]),
detection_boxes:
np.array([[100., 100., 200., 200.], [0., 0., 0., 0.]]),
detection_scores:
np.array([.8, 0.]),
detection_classes:
np.array([1, -1])
})
sess.run(
update_op,
feed_dict={
image_id:
'image2',
groundtruth_boxes:
np.array([[50., 50., 100., 100.], [-1, -1, -1, -1]]),
groundtruth_classes:
np.array([3, -1]),
detection_boxes:
np.array([[50., 50., 100., 100.], [0., 0., 0., 0.]]),
detection_scores:
np.array([.7, 0.]),
detection_classes:
np.array([3, -1])
})
sess.run(
update_op,
feed_dict={
image_id:
'image3',
groundtruth_boxes:
np.array([[25., 25., 50., 50.], [10., 10., 15., 15.]]),
groundtruth_classes:
np.array([2, 2]),
detection_boxes:
np.array([[25., 25., 50., 50.], [10., 10., 15., 15.]]),
detection_scores:
np.array([.95, .9]),
detection_classes:
np.array([2, 2])
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsBatched(self):
category_list = [{'id': 0, 'name': 'person'},
{'id': 1, 'name': 'cat'},
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(update_op,
feed_dict={
image_id: ['image1', 'image2', 'image3'],
groundtruth_boxes: np.array([[[100., 100., 200., 200.]],
[[50., 50., 100., 100.]],
[[25., 25., 50., 50.]]]),
groundtruth_classes: np.array([[1], [3], [2]]),
detection_boxes: np.array([[[100., 100., 200., 200.]],
[[50., 50., 100., 100.]],
[[25., 25., 50., 50.]]]),
detection_scores: np.array([[.8], [.7], [.9]]),
detection_classes: np.array([[1], [3], [2]])
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
def testGetOneMAPWithMatchingGroundtruthAndDetectionsPaddedBatches(self):
category_list = [{'id': 0, 'name': 'person'},
{'id': 1, 'name': 'cat'},
{'id': 2, 'name': 'dog'}]
coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list)
batch_size = 3
image_id = tf.placeholder(tf.string, shape=(batch_size))
groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
num_gt_boxes_per_image = tf.placeholder(tf.int32, shape=(None,))
detection_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4))
detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None))
detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None))
num_det_boxes_per_image = tf.placeholder(tf.int32, shape=(None,))
eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
image_id, groundtruth_boxes,
groundtruth_classes,
detection_boxes,
detection_scores,
detection_classes,
num_gt_boxes_per_image,
num_det_boxes_per_image)
_, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP']
with self.test_session() as sess:
sess.run(update_op,
feed_dict={
image_id: ['image1', 'image2', 'image3'],
groundtruth_boxes: np.array([[[100., 100., 200., 200.],
[-1, -1, -1, -1]],
[[50., 50., 100., 100.],
[-1, -1, -1, -1]],
[[25., 25., 50., 50.],
[10., 10., 15., 15.]]]),
groundtruth_classes: np.array([[1, -1], [3, -1], [2, 2]]),
num_gt_boxes_per_image: np.array([1, 1, 2]),
detection_boxes: np.array([[[100., 100., 200., 200.],
[0., 0., 0., 0.]],
[[50., 50., 100., 100.],
[0., 0., 0., 0.]],
[[25., 25., 50., 50.],
[10., 10., 15., 15.]]]),
detection_scores: np.array([[.8, 0.], [.7, 0.], [.95, .9]]),
detection_classes: np.array([[1, -1], [3, -1], [2, 2]]),
num_det_boxes_per_image: np.array([1, 1, 2]),
})
metrics = {}
for key, (value_op, _) in eval_metric_ops.iteritems():
metrics[key] = value_op
metrics = sess.run(metrics)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'],
-1.0)
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0)
self.assertFalse(coco_evaluator._groundtruth_list)
self.assertFalse(coco_evaluator._detection_boxes_list)
self.assertFalse(coco_evaluator._image_ids)
class CocoMaskEvaluationTest(tf.test.TestCase):
......
......@@ -325,16 +325,16 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
}
eval_metric_ops = None
if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
if mode == tf.estimator.ModeKeys.EVAL:
class_agnostic = (fields.DetectionResultFields.detection_classes
not in detections)
groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
use_original_images = fields.InputDataFields.original_image in features
original_images = (
eval_images = (
features[fields.InputDataFields.original_image] if use_original_images
else features[fields.InputDataFields.image])
eval_dict = eval_util.result_dict_for_single_example(
original_images[0:1],
eval_images[0:1],
features[inputs.HASH_KEY][0],
detections,
groundtruth,
......@@ -355,22 +355,21 @@ def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False):
img_summary = tf.summary.image('Detections_Left_Groundtruth_Right',
detection_and_groundtruth)
if mode == tf.estimator.ModeKeys.EVAL:
# Eval metrics on a single example.
eval_metrics = eval_config.metrics_set
if not eval_metrics:
eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics, category_index.values(), eval_dict,
include_metrics_per_category=False)
for loss_key, loss_tensor in iter(losses_dict.items()):
eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
for var in optimizer_summary_vars:
eval_metric_ops[var.op.name] = (var, tf.no_op())
if img_summary is not None:
eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
img_summary, tf.no_op())
eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()}
# Eval metrics on a single example.
eval_metrics = eval_config.metrics_set
if not eval_metrics:
eval_metrics = ['coco_detection_metrics']
eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
eval_metrics, category_index.values(), eval_dict,
include_metrics_per_category=False)
for loss_key, loss_tensor in iter(losses_dict.items()):
eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
for var in optimizer_summary_vars:
eval_metric_ops[var.op.name] = (var, tf.no_op())
if img_summary is not None:
eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
img_summary, tf.no_op())
eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()}
if use_tpu:
return tf.contrib.tpu.TPUEstimatorSpec(
......
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -36,6 +36,10 @@ flags.DEFINE_string(
'hparams_overrides', None, 'Hyperparameter overrides, '
'represented as a string containing comma-separated '
'hparam_name=value pairs.')
flags.DEFINE_string(
'checkpoint_dir', None, 'Path to directory holding a checkpoint. If '
'`checkpoint_dir` is provided, this binary operates in eval-only mode, '
'writing resulting metrics to `model_dir`.')
FLAGS = flags.FLAGS
......@@ -59,17 +63,23 @@ def main(unused_argv):
train_steps = train_and_eval_dict['train_steps']
eval_steps = train_and_eval_dict['eval_steps']
train_spec, eval_specs = model_lib.create_train_and_eval_specs(
train_input_fn,
eval_input_fn,
eval_on_train_input_fn,
predict_input_fn,
train_steps,
eval_steps,
eval_on_train_data=False)
if FLAGS.checkpoint_dir:
estimator.evaluate(eval_input_fn,
eval_steps,
checkpoint_path=tf.train.latest_checkpoint(
FLAGS.checkpoint_dir))
else:
train_spec, eval_specs = model_lib.create_train_and_eval_specs(
train_input_fn,
eval_input_fn,
eval_on_train_input_fn,
predict_input_fn,
train_steps,
eval_steps,
eval_on_train_data=False)
# Currently only a single Eval Spec is allowed.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
# Currently only a single Eval Spec is allowed.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
if __name__ == '__main__':
......
......@@ -56,7 +56,9 @@ flags.DEFINE_integer('iterations_per_loop', 100,
# recent checkpoint every 10 minutes by default for train_and_eval
flags.DEFINE_string('mode', 'train',
'Mode to run: train, eval')
flags.DEFINE_integer('train_batch_size', 32 * 8, 'Batch size for training.')
flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If '
'this is not provided, batch size is read from training '
'config.')
flags.DEFINE_string(
'hparams_overrides', None, 'Comma-separated list of '
......@@ -93,6 +95,10 @@ def main(unused_argv):
iterations_per_loop=FLAGS.iterations_per_loop,
num_shards=FLAGS.num_shards))
kwargs = {}
if FLAGS.train_batch_size:
kwargs['batch_size'] = FLAGS.train_batch_size
train_and_eval_dict = model_lib.create_estimator_and_inputs(
run_config=config,
hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
......@@ -102,7 +108,7 @@ def main(unused_argv):
use_tpu_estimator=True,
use_tpu=FLAGS.use_tpu,
num_shards=FLAGS.num_shards,
batch_size=FLAGS.train_batch_size)
**kwargs)
estimator = train_and_eval_dict['estimator']
train_input_fn = train_and_eval_dict['train_input_fn']
eval_input_fn = train_and_eval_dict['eval_input_fn']
......
......@@ -14,6 +14,8 @@
# ==============================================================================
"""Mobilenet v1 Faster R-CNN implementation."""
import numpy as np
import tensorflow as tf
from object_detection.meta_architectures import faster_rcnn_meta_arch
......@@ -23,22 +25,31 @@ from nets import mobilenet_v1
slim = tf.contrib.slim
_MOBILENET_V1_100_CONV_NO_LAST_STRIDE_DEFS = [
mobilenet_v1.Conv(kernel=[3, 3], stride=2, depth=32),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=64),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=128),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=128),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=256),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=256),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=512),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=1024),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=1024)
]
def _get_mobilenet_conv_no_last_stride_defs(conv_depth_ratio_in_percentage):
if conv_depth_ratio_in_percentage not in [25, 50, 75, 100]:
raise ValueError(
'Only the following ratio percentages are supported: 25, 50, 75, 100')
conv_depth_ratio_in_percentage = float(conv_depth_ratio_in_percentage) / 100.0
channels = np.array([
32, 64, 128, 128, 256, 256, 512, 512, 512, 512, 512, 512, 1024, 1024
], dtype=np.float32)
channels = (channels * conv_depth_ratio_in_percentage).astype(np.int32)
return [
mobilenet_v1.Conv(kernel=[3, 3], stride=2, depth=channels[0]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[1]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[2]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[3]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[4]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[5]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[6]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[7]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[8]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[9]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[10]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[11]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[12]),
mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[13])
]
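A quick check of the scaled conv defs (assumes the function above plus the `Conv`/`DepthSepConv` namedtuples from slim's `mobilenet_v1`):

```python
defs = _get_mobilenet_conv_no_last_stride_defs(50)
print([d.depth for d in defs])
# [16, 32, 64, 64, 128, 128, 256, 256, 256, 256, 256, 256, 512, 512]
```

The box-classifier head below applies the same ratio to its 1024-deep separable convs, e.g. int(1024 * 0.5) = 512 at 50 percent.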
class FasterRCNNMobilenetV1FeatureExtractor(
......@@ -53,7 +64,8 @@ class FasterRCNNMobilenetV1FeatureExtractor(
weight_decay=0.0,
depth_multiplier=1.0,
min_depth=16,
skip_last_stride=False):
skip_last_stride=False,
conv_depth_ratio_in_percentage=100):
"""Constructor.
Args:
......@@ -65,6 +77,8 @@ class FasterRCNNMobilenetV1FeatureExtractor(
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
skip_last_stride: Skip the last stride if True.
conv_depth_ratio_in_percentage: Conv depth ratio in percentage. Only
applied if skip_last_stride is True.
Raises:
ValueError: If `first_stage_features_stride` is not 8 or 16.
......@@ -74,6 +88,7 @@ class FasterRCNNMobilenetV1FeatureExtractor(
self._depth_multiplier = depth_multiplier
self._min_depth = min_depth
self._skip_last_stride = skip_last_stride
self._conv_depth_ratio_in_percentage = conv_depth_ratio_in_percentage
super(FasterRCNNMobilenetV1FeatureExtractor, self).__init__(
is_training, first_stage_features_stride, batch_norm_trainable,
reuse_weights, weight_decay)
......@@ -124,7 +139,9 @@ class FasterRCNNMobilenetV1FeatureExtractor(
reuse=self._reuse_weights) as scope:
params = {}
if self._skip_last_stride:
params['conv_defs'] = _MOBILENET_V1_100_CONV_NO_LAST_STRIDE_DEFS
params['conv_defs'] = _get_mobilenet_conv_no_last_stride_defs(
conv_depth_ratio_in_percentage=self.
_conv_depth_ratio_in_percentage)
_, activations = mobilenet_v1.mobilenet_v1_base(
preprocessed_inputs,
final_endpoint='Conv2d_11_pointwise',
......@@ -150,6 +167,11 @@ class FasterRCNNMobilenetV1FeatureExtractor(
"""
net = proposal_feature_maps
conv_depth = 1024
if self._skip_last_stride:
conv_depth_ratio = float(self._conv_depth_ratio_in_percentage) / 100.0
conv_depth = int(float(conv_depth) * conv_depth_ratio)
depth = lambda d: max(int(d * 1.0), 16)
with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights):
with slim.arg_scope(
......@@ -160,13 +182,13 @@ class FasterRCNNMobilenetV1FeatureExtractor(
[slim.conv2d, slim.separable_conv2d], padding='SAME'):
net = slim.separable_conv2d(
net,
depth(1024), [3, 3],
depth(conv_depth), [3, 3],
depth_multiplier=1,
stride=2,
scope='Conv2d_12_pointwise')
return slim.separable_conv2d(
net,
depth(1024), [3, 3],
depth(conv_depth), [3, 3],
depth_multiplier=1,
stride=1,
scope='Conv2d_13_pointwise')
......@@ -20,7 +20,7 @@ message ConvolutionalBoxPredictor {
// Hyperparameters for convolution ops used in the box predictor.
optional Hyperparams conv_hyperparams = 1;
// Minumum feature depth prior to predicting box encodings and class
// Minimum feature depth prior to predicting box encodings and class
// predictions.
optional int32 min_depth = 2 [default = 0];
......@@ -81,6 +81,12 @@ message WeightSharedConvolutionalBoxPredictor {
// training where there are a large number of negative boxes. See
// https://arxiv.org/abs/1708.02002 for details.
optional float class_prediction_bias_init = 10 [default = 0.0];
// Whether to use dropout for class prediction.
optional bool use_dropout = 11 [default = false];
// Keep probability for dropout.
optional float dropout_keep_probability = 12 [default = 0.8];
}
message MaskRCNNBoxPredictor {
......@@ -119,6 +125,10 @@ message MaskRCNNBoxPredictor {
// branch.
optional int32 mask_prediction_num_conv_layers = 11 [default = 2];
optional bool masks_are_class_agnostic = 12 [default = false];
// Whether to use one box for all classes rather than a different box for each
// class.
optional bool share_box_across_classes = 13 [default = false];
}
message RfcnBoxPredictor {
......
......@@ -60,6 +60,9 @@ message InputReader {
// Number of parallel decode ops to apply.
optional uint32 num_parallel_map_calls = 14 [default = 64];
// Number of groundtruth keypoints per object.
optional uint32 num_keypoints = 16 [default = 0];
// Whether to load groundtruth instance masks.
optional bool load_instance_masks = 7 [default = false];
......