Unverified commit fd7b6887 authored by Jonathan Huang, committed by GitHub

Merge pull request #3293 from pkulzc/master

Internal changes of object_detection 
parents f98ec55e 1efe98bb
......@@ -116,18 +116,17 @@ def build_faster_rcnn_classification_loss(loss_config):
loss_type = loss_config.WhichOneof('classification_loss')
if loss_type == 'weighted_sigmoid':
config = loss_config.weighted_sigmoid
return losses.WeightedSigmoidClassificationLoss(
anchorwise_output=config.anchorwise_output)
return losses.WeightedSigmoidClassificationLoss()
if loss_type == 'weighted_softmax':
config = loss_config.weighted_softmax
return losses.WeightedSoftmaxClassificationLoss(
anchorwise_output=config.anchorwise_output)
logit_scale=config.logit_scale)
# By default, Faster RCNN second stage classifier uses Softmax loss
# with anchor-wise outputs.
config = loss_config.weighted_softmax
return losses.WeightedSoftmaxClassificationLoss(
anchorwise_output=True)
logit_scale=config.logit_scale)
def _build_localization_loss(loss_config):
......@@ -148,14 +147,10 @@ def _build_localization_loss(loss_config):
loss_type = loss_config.WhichOneof('localization_loss')
if loss_type == 'weighted_l2':
config = loss_config.weighted_l2
return losses.WeightedL2LocalizationLoss(
anchorwise_output=config.anchorwise_output)
return losses.WeightedL2LocalizationLoss()
if loss_type == 'weighted_smooth_l1':
config = loss_config.weighted_smooth_l1
return losses.WeightedSmoothL1LocalizationLoss(
anchorwise_output=config.anchorwise_output)
return losses.WeightedSmoothL1LocalizationLoss()
if loss_type == 'weighted_iou':
return losses.WeightedIOULocalizationLoss()
......@@ -181,9 +176,7 @@ def _build_classification_loss(loss_config):
loss_type = loss_config.WhichOneof('classification_loss')
if loss_type == 'weighted_sigmoid':
config = loss_config.weighted_sigmoid
return losses.WeightedSigmoidClassificationLoss(
anchorwise_output=config.anchorwise_output)
return losses.WeightedSigmoidClassificationLoss()
if loss_type == 'weighted_sigmoid_focal':
config = loss_config.weighted_sigmoid_focal
......@@ -191,21 +184,18 @@ def _build_classification_loss(loss_config):
if config.HasField('alpha'):
alpha = config.alpha
return losses.SigmoidFocalClassificationLoss(
anchorwise_output=config.anchorwise_output,
gamma=config.gamma,
alpha=alpha)
if loss_type == 'weighted_softmax':
config = loss_config.weighted_softmax
return losses.WeightedSoftmaxClassificationLoss(
anchorwise_output=config.anchorwise_output,
logit_scale=config.logit_scale)
if loss_type == 'bootstrapped_sigmoid':
config = loss_config.bootstrapped_sigmoid
return losses.BootstrappedSigmoidClassificationLoss(
alpha=config.alpha,
bootstrap_type=('hard' if config.hard_bootstrap else 'soft'),
anchorwise_output=config.anchorwise_output)
bootstrap_type=('hard' if config.hard_bootstrap else 'soft'))
raise ValueError('Empty loss config.')
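# With anchorwise_output gone from both the protos and the loss classes,
# classification losses now always return per-anchor, per-class values (hence
# the [1, 2, 3] shape asserted in the updated test below). A minimal usage
# sketch of the builder above; the ClassificationLoss message name is inferred
# from the WhichOneof call and is an assumption, not shown in this diff:
from google.protobuf import text_format
from object_detection.protos import losses_pb2

loss_proto = losses_pb2.ClassificationLoss()
text_format.Merge('weighted_softmax { logit_scale: 2.0 }', loss_proto)
classification_loss = _build_classification_loss(loss_proto)
# -> losses.WeightedSoftmaxClassificationLoss(logit_scale=2.0); there is no
#    anchorwise_output argument to pass any more.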
......@@ -80,7 +80,6 @@ class LocalizationLossBuilderTest(tf.test.TestCase):
losses_text_proto = """
localization_loss {
weighted_smooth_l1 {
anchorwise_output: true
}
}
classification_loss {
......@@ -245,7 +244,7 @@ class ClassificationLossBuilderTest(tf.test.TestCase):
targets = tf.constant([[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]])
weights = tf.constant([[1.0, 1.0]])
loss = classification_loss(predictions, targets, weights=weights)
self.assertEqual(loss.shape, [1, 2])
self.assertEqual(loss.shape, [1, 2, 3])
def test_raise_error_on_empty_config(self):
losses_text_proto = """
......
......@@ -45,7 +45,9 @@ def build(matcher_config):
matched_threshold=matched_threshold,
unmatched_threshold=unmatched_threshold,
negatives_lower_than_unmatched=matcher.negatives_lower_than_unmatched,
force_match_for_each_row=matcher.force_match_for_each_row)
force_match_for_each_row=matcher.force_match_for_each_row,
use_matmul_gather=matcher.use_matmul_gather)
if matcher_config.WhichOneof('matcher_oneof') == 'bipartite_matcher':
return bipartite_matcher.GreedyBipartiteMatcher()
matcher = matcher_config.bipartite_matcher
return bipartite_matcher.GreedyBipartiteMatcher(matcher.use_matmul_gather)
raise ValueError('Empty matcher.')
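# A sketch of exercising the new flag end to end (mirrors the updated test
# below; the matcher_builder import path is assumed, not shown in this diff):
from google.protobuf import text_format
from object_detection.builders import matcher_builder
from object_detection.protos import matcher_pb2

matcher_proto = matcher_pb2.Matcher()
text_format.Merge('argmax_matcher { use_matmul_gather: true }', matcher_proto)
matcher_object = matcher_builder.build(matcher_proto)
# The flag is forwarded to ArgMaxMatcher as a keyword argument; for
# bipartite_matcher it is passed positionally to GreedyBipartiteMatcher.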
......@@ -62,6 +62,7 @@ class MatcherBuilderTest(tf.test.TestCase):
unmatched_threshold: 0.3
negatives_lower_than_unmatched: false
force_match_for_each_row: true
use_matmul_gather: true
}
"""
matcher_proto = matcher_pb2.Matcher()
......@@ -72,6 +73,7 @@ class MatcherBuilderTest(tf.test.TestCase):
self.assertAlmostEqual(matcher_object._unmatched_threshold, 0.3)
self.assertFalse(matcher_object._negatives_lower_than_unmatched)
self.assertTrue(matcher_object._force_match_for_each_row)
self.assertTrue(matcher_object._use_matmul_gather)
def test_build_bipartite_matcher(self):
matcher_text_proto = """
......
......@@ -31,6 +31,7 @@ from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extr
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor
from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor
......@@ -42,6 +43,9 @@ SSD_FEATURE_EXTRACTOR_CLASS_MAP = {
'ssd_inception_v2': SSDInceptionV2FeatureExtractor,
'ssd_inception_v3': SSDInceptionV3FeatureExtractor,
'ssd_mobilenet_v1': SSDMobileNetV1FeatureExtractor,
'ssd_resnet50_v1_fpn': ssd_resnet_v1_fpn.SSDResnet50V1FpnFeatureExtractor,
'ssd_resnet101_v1_fpn': ssd_resnet_v1_fpn.SSDResnet101V1FpnFeatureExtractor,
'ssd_resnet152_v1_fpn': ssd_resnet_v1_fpn.SSDResnet152V1FpnFeatureExtractor,
'embedded_ssd_mobilenet_v1': EmbeddedSSDMobileNetV1FeatureExtractor,
}
......@@ -62,13 +66,14 @@ FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
}
def build(model_config, is_training):
def build(model_config, is_training, add_summaries=True):
"""Builds a DetectionModel based on the model config.
Args:
model_config: A model.proto object containing the config for the desired
DetectionModel.
is_training: True if this model is being built for training purposes.
add_summaries: Whether to add tensorflow summaries in the model graph.
Returns:
DetectionModel based on the config.
......@@ -80,9 +85,10 @@ def build(model_config, is_training):
raise ValueError('model_config not of type model_pb2.DetectionModel.')
meta_architecture = model_config.WhichOneof('model')
if meta_architecture == 'ssd':
return _build_ssd_model(model_config.ssd, is_training)
return _build_ssd_model(model_config.ssd, is_training, add_summaries)
if meta_architecture == 'faster_rcnn':
return _build_faster_rcnn_model(model_config.faster_rcnn, is_training)
return _build_faster_rcnn_model(model_config.faster_rcnn, is_training,
add_summaries)
raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))
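# The new flag threads through both meta-architecture builders; a hedged
# calling sketch, where model_text_proto stands for any of the configs used
# in the tests below:
from google.protobuf import text_format
from object_detection.builders import model_builder  # import path assumed
from object_detection.protos import model_pb2

model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
# Summaries can now be disabled at build time, e.g. when exporting:
model = model_builder.build(model_proto, is_training=False, add_summaries=False)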
......@@ -106,6 +112,8 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training,
min_depth = feature_extractor_config.min_depth
pad_to_multiple = feature_extractor_config.pad_to_multiple
batch_norm_trainable = feature_extractor_config.batch_norm_trainable
use_explicit_padding = feature_extractor_config.use_explicit_padding
use_depthwise = feature_extractor_config.use_depthwise
conv_hyperparams = hyperparams_builder.build(
feature_extractor_config.conv_hyperparams, is_training)
......@@ -115,16 +123,18 @@ def _build_ssd_feature_extractor(feature_extractor_config, is_training,
feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
return feature_extractor_class(is_training, depth_multiplier, min_depth,
pad_to_multiple, conv_hyperparams,
batch_norm_trainable, reuse_weights)
batch_norm_trainable, reuse_weights,
use_explicit_padding, use_depthwise)
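# The two new fields are read directly off the feature extractor config; a
# text-proto sketch (field placement inferred from the reads above; the other
# required fields, e.g. conv_hyperparams, are elided):
#
#   feature_extractor {
#     type: 'ssd_mobilenet_v1'
#     use_explicit_padding: true
#     use_depthwise: true
#   }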
def _build_ssd_model(ssd_config, is_training):
def _build_ssd_model(ssd_config, is_training, add_summaries):
"""Builds an SSD detection model based on the model config.
Args:
ssd_config: A ssd.proto object containing the config for the desired
SSDMetaArch.
is_training: True if this model is being built for training purposes.
add_summaries: Whether to add tf summaries in the model.
Returns:
SSDMetaArch based on the config.
......@@ -171,7 +181,8 @@ def _build_ssd_model(ssd_config, is_training):
classification_weight,
localization_weight,
normalize_loss_by_num_matches,
hard_example_miner)
hard_example_miner,
add_summaries=add_summaries)
def _build_faster_rcnn_feature_extractor(
......@@ -205,7 +216,7 @@ def _build_faster_rcnn_feature_extractor(
batch_norm_trainable, reuse_weights)
def _build_faster_rcnn_model(frcnn_config, is_training):
def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
"""Builds a Faster R-CNN or R-FCN detection model based on the model config.
Builds R-FCN model if the second_stage_box_predictor in the config is of type
......@@ -213,8 +224,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
Args:
frcnn_config: A faster_rcnn.proto object containing the config for the
desired FasterRCNNMetaArch or RFCNMetaArch.
is_training: True if this model is being built for training purposes.
add_summaries: Whether to add tf summaries in the model.
Returns:
FasterRCNNMetaArch based on the config.
......@@ -228,7 +240,7 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
feature_extractor = _build_faster_rcnn_feature_extractor(
frcnn_config.feature_extractor, is_training)
first_stage_only = frcnn_config.first_stage_only
number_of_stages = frcnn_config.number_of_stages
first_stage_anchor_generator = anchor_generator_builder.build(
frcnn_config.first_stage_anchor_generator)
......@@ -283,7 +295,7 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
'num_classes': num_classes,
'image_resizer_fn': image_resizer_fn,
'feature_extractor': feature_extractor,
'first_stage_only': first_stage_only,
'number_of_stages': number_of_stages,
'first_stage_anchor_generator': first_stage_anchor_generator,
'first_stage_atrous_rate': first_stage_atrous_rate,
'first_stage_box_predictor_arg_scope':
......@@ -310,7 +322,8 @@ def _build_faster_rcnn_model(frcnn_config, is_training):
second_stage_classification_loss,
'second_stage_classification_loss_weight':
second_stage_classification_loss_weight,
'hard_example_miner': hard_example_miner}
'hard_example_miner': hard_example_miner,
'add_summaries': add_summaries}
if isinstance(second_stage_box_predictor, box_predictor.RfcnBoxPredictor):
return rfcn_meta_arch.RFCNMetaArch(
......
......@@ -26,12 +26,14 @@ from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extr
from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1
from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn
from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor
from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor
from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor
from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor
from object_detection.protos import model_pb2
FEATURE_EXTRACTOR_MAPS = {
FRCNN_RESNET_FEAT_MAPS = {
'faster_rcnn_resnet50':
frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor,
'faster_rcnn_resnet101':
......@@ -40,6 +42,15 @@ FEATURE_EXTRACTOR_MAPS = {
frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor
}
SSD_RESNET_V1_FPN_FEAT_MAPS = {
'ssd_resnet50_v1_fpn':
ssd_resnet_v1_fpn.SSDResnet50V1FpnFeatureExtractor,
'ssd_resnet101_v1_fpn':
ssd_resnet_v1_fpn.SSDResnet101V1FpnFeatureExtractor,
'ssd_resnet152_v1_fpn':
ssd_resnet_v1_fpn.SSDResnet152V1FpnFeatureExtractor
}
class ModelBuilderTest(tf.test.TestCase):
......@@ -197,6 +208,87 @@ class ModelBuilderTest(tf.test.TestCase):
self.assertIsInstance(model._feature_extractor,
SSDInceptionV3FeatureExtractor)
def test_create_ssd_resnet_v1_fpn_model_from_config(self):
model_text_proto = """
ssd {
feature_extractor {
type: 'ssd_resnet50_v1_fpn'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
batch_norm_trainable: true
}
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
multiscale_anchor_generator {
aspect_ratios: [1.0, 2.0, 0.5]
scales_per_octave: 2
}
}
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
depth: 32
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
num_layers_before_predictor: 1
}
}
loss {
classification_loss {
weighted_sigmoid_focal {
alpha: 0.25
gamma: 2.0
}
}
localization_loss {
weighted_smooth_l1 {
}
}
classification_weight: 1.0
localization_weight: 1.0
}
}"""
model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
for extractor_type, extractor_class in SSD_RESNET_V1_FPN_FEAT_MAPS.items():
model_proto.ssd.feature_extractor.type = extractor_type
model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch)
self.assertIsInstance(model._feature_extractor, extractor_class)
def test_create_ssd_mobilenet_v1_model_from_config(self):
model_text_proto = """
ssd {
......@@ -270,6 +362,78 @@ class ModelBuilderTest(tf.test.TestCase):
SSDMobileNetV1FeatureExtractor)
self.assertTrue(model._feature_extractor._batch_norm_trainable)
def test_create_embedded_ssd_mobilenet_v1_model_from_config(self):
model_text_proto = """
ssd {
feature_extractor {
type: 'embedded_ssd_mobilenet_v1'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
batch_norm_trainable: true
}
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
aspect_ratios: 1.0
}
}
image_resizer {
fixed_shape_resizer {
height: 256
width: 256
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
}
loss {
classification_loss {
weighted_softmax {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
}
}"""
model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
model = self.create_model(model_proto)
self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch)
self.assertIsInstance(model._feature_extractor,
EmbeddedSSDMobileNetV1FeatureExtractor)
def test_create_faster_rcnn_resnet_v1_models_from_config(self):
model_text_proto = """
faster_rcnn {
......@@ -331,7 +495,7 @@ class ModelBuilderTest(tf.test.TestCase):
}"""
model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.items():
for extractor_type, extractor_class in FRCNN_RESNET_FEAT_MAPS.items():
model_proto.faster_rcnn.feature_extractor.type = extractor_type
model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch)
......@@ -730,7 +894,7 @@ class ModelBuilderTest(tf.test.TestCase):
}"""
model_proto = model_pb2.DetectionModel()
text_format.Merge(model_text_proto, model_proto)
for extractor_type, extractor_class in FEATURE_EXTRACTOR_MAPS.items():
for extractor_type, extractor_class in FRCNN_RESNET_FEAT_MAPS.items():
model_proto.faster_rcnn.feature_extractor.type = extractor_type
model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, rfcn_meta_arch.RFCNMetaArch)
......
......@@ -19,15 +19,14 @@ import tensorflow as tf
from object_detection.utils import learning_schedules
def build(optimizer_config, global_summaries):
def build(optimizer_config):
"""Create optimizer based on config.
Args:
optimizer_config: An Optimizer proto message.
global_summaries: A set to attach learning rate summary to.
Returns:
An optimizer.
An optimizer and a list of variables for summary.
Raises:
ValueError: when using an unsupported input data type.
......@@ -35,24 +34,30 @@ def build(optimizer_config, global_summaries):
optimizer_type = optimizer_config.WhichOneof('optimizer')
optimizer = None
summary_vars = []
if optimizer_type == 'rms_prop_optimizer':
config = optimizer_config.rms_prop_optimizer
learning_rate = _create_learning_rate(config.learning_rate)
summary_vars.append(learning_rate)
optimizer = tf.train.RMSPropOptimizer(
_create_learning_rate(config.learning_rate, global_summaries),
learning_rate,
decay=config.decay,
momentum=config.momentum_optimizer_value,
epsilon=config.epsilon)
if optimizer_type == 'momentum_optimizer':
config = optimizer_config.momentum_optimizer
learning_rate = _create_learning_rate(config.learning_rate)
summary_vars.append(learning_rate)
optimizer = tf.train.MomentumOptimizer(
_create_learning_rate(config.learning_rate, global_summaries),
learning_rate,
momentum=config.momentum_optimizer_value)
if optimizer_type == 'adam_optimizer':
config = optimizer_config.adam_optimizer
optimizer = tf.train.AdamOptimizer(
_create_learning_rate(config.learning_rate, global_summaries))
learning_rate = _create_learning_rate(config.learning_rate)
summary_vars.append(learning_rate)
optimizer = tf.train.AdamOptimizer(learning_rate)
if optimizer is None:
raise ValueError('Optimizer %s not supported.' % optimizer_type)
......@@ -61,15 +66,14 @@ def build(optimizer_config, global_summaries):
optimizer = tf.contrib.opt.MovingAverageOptimizer(
optimizer, average_decay=optimizer_config.moving_average_decay)
return optimizer
return optimizer, summary_vars
def _create_learning_rate(learning_rate_config, global_summaries):
def _create_learning_rate(learning_rate_config):
"""Create optimizer learning rate based on config.
Args:
learning_rate_config: A LearningRate proto message.
global_summaries: A set to attach learning rate summary to.
Returns:
A learning rate.
......@@ -81,7 +85,7 @@ def _create_learning_rate(learning_rate_config, global_summaries):
learning_rate_type = learning_rate_config.WhichOneof('learning_rate')
if learning_rate_type == 'constant_learning_rate':
config = learning_rate_config.constant_learning_rate
learning_rate = config.learning_rate
learning_rate = tf.constant(config.learning_rate, dtype=tf.float32)
if learning_rate_type == 'exponential_decay_learning_rate':
config = learning_rate_config.exponential_decay_learning_rate
......@@ -115,5 +119,4 @@ def _create_learning_rate(learning_rate_config, global_summaries):
if learning_rate is None:
raise ValueError('Learning_rate %s not supported.' % learning_rate_type)
global_summaries.add(tf.summary.scalar('Learning_Rate', learning_rate))
return learning_rate
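# Summary wiring now happens at the call site rather than through a shared
# global_summaries set; a sketch of the new convention (naming the summary
# via var.op.name is illustrative, not part of this diff):
optimizer, summary_vars = optimizer_builder.build(optimizer_proto)
# _create_learning_rate now always returns a tensor (constant_learning_rate
# included, via tf.constant), so every entry can be summarized uniformly:
for var in summary_vars:
  tf.summary.scalar(var.op.name, var)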
......@@ -31,12 +31,13 @@ class LearningRateBuilderTest(tf.test.TestCase):
learning_rate: 0.004
}
"""
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries)
self.assertAlmostEqual(learning_rate, 0.004)
learning_rate_proto)
with self.test_session():
learning_rate_out = learning_rate.eval()
self.assertAlmostEqual(learning_rate_out, 0.004)
def testBuildExponentialDecayLearningRate(self):
learning_rate_text_proto = """
......@@ -47,11 +48,10 @@ class LearningRateBuilderTest(tf.test.TestCase):
staircase: false
}
"""
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries)
learning_rate_proto)
self.assertTrue(isinstance(learning_rate, tf.Tensor))
def testBuildManualStepLearningRate(self):
......@@ -67,11 +67,10 @@ class LearningRateBuilderTest(tf.test.TestCase):
}
}
"""
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries)
learning_rate_proto)
self.assertTrue(isinstance(learning_rate, tf.Tensor))
def testBuildCosineDecayLearningRate(self):
......@@ -83,22 +82,19 @@ class LearningRateBuilderTest(tf.test.TestCase):
warmup_steps: 1000
}
"""
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto)
learning_rate = optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries)
learning_rate_proto)
self.assertTrue(isinstance(learning_rate, tf.Tensor))
def testRaiseErrorOnEmptyLearningRate(self):
learning_rate_text_proto = """
"""
global_summaries = set([])
learning_rate_proto = optimizer_pb2.LearningRate()
text_format.Merge(learning_rate_text_proto, learning_rate_proto)
with self.assertRaises(ValueError):
optimizer_builder._create_learning_rate(
learning_rate_proto, global_summaries)
optimizer_builder._create_learning_rate(learning_rate_proto)
class OptimizerBuilderTest(tf.test.TestCase):
......@@ -119,10 +115,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
}
use_moving_average: false
"""
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(isinstance(optimizer, tf.train.RMSPropOptimizer))
def testBuildMomentumOptimizer(self):
......@@ -137,10 +132,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
}
use_moving_average: false
"""
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(isinstance(optimizer, tf.train.MomentumOptimizer))
def testBuildAdamOptimizer(self):
......@@ -154,10 +148,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
}
use_moving_average: false
"""
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(isinstance(optimizer, tf.train.AdamOptimizer))
def testBuildMovingAverageOptimizer(self):
......@@ -171,10 +164,9 @@ class OptimizerBuilderTest(tf.test.TestCase):
}
use_moving_average: True
"""
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(
isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer))
......@@ -190,23 +182,21 @@ class OptimizerBuilderTest(tf.test.TestCase):
use_moving_average: True
moving_average_decay: 0.2
"""
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto)
optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
optimizer, _ = optimizer_builder.build(optimizer_proto)
self.assertTrue(
isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer))
# TODO(rathodv): Find a way to not depend on the private members.
# TODO: Find a way to not depend on the private members.
self.assertAlmostEqual(optimizer._ema._decay, 0.2)
def testBuildEmptyOptimizer(self):
optimizer_text_proto = """
"""
global_summaries = set([])
optimizer_proto = optimizer_pb2.Optimizer()
text_format.Merge(optimizer_text_proto, optimizer_proto)
with self.assertRaises(ValueError):
optimizer_builder.build(optimizer_proto, global_summaries)
optimizer_builder.build(optimizer_proto)
if __name__ == '__main__':
......
......@@ -83,6 +83,7 @@ PREPROCESSING_FUNCTION_MAP = {
'random_jitter_boxes': preprocessor.random_jitter_boxes,
'random_crop_to_aspect_ratio': preprocessor.random_crop_to_aspect_ratio,
'random_black_patches': preprocessor.random_black_patches,
'rgb_to_gray': preprocessor.rgb_to_gray,
'scale_boxes_to_pixel_coordinates': (
preprocessor.scale_boxes_to_pixel_coordinates),
'subtract_channel_mean': preprocessor.subtract_channel_mean,
......
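# With the map entry above, rgb_to_gray becomes selectable from a
# preprocessing config; a text-proto sketch (the data_augmentation_options
# wrapper is an assumption, it is not part of this diff):
#
#   data_augmentation_options {
#     rgb_to_gray {
#     }
#   }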
......@@ -379,6 +379,16 @@ class PreprocessorBuilderTest(tf.test.TestCase):
'new_width': 100,
'method': tf.image.ResizeMethod.BICUBIC})
def test_build_rgb_to_gray(self):
preprocessor_text_proto = """
rgb_to_gray {}
"""
preprocessor_proto = preprocessor_pb2.PreprocessingStep()
text_format.Merge(preprocessor_text_proto, preprocessor_proto)
function, args = preprocessor_builder.build(preprocessor_proto)
self.assertEqual(function, preprocessor.rgb_to_gray)
self.assertEqual(args, {})
def test_build_subtract_channel_mean(self):
preprocessor_text_proto = """
subtract_channel_mean {
......
......@@ -53,7 +53,7 @@ py_library(
deps = [
":box_list",
"//tensorflow",
"//tensorflow_models/object_detection/utils:shape_utils",
"//tensorflow/models/research/object_detection/utils:shape_utils",
],
)
......@@ -113,7 +113,7 @@ py_library(
":box_list",
":box_list_ops",
"//tensorflow",
"//tensorflow_models/object_detection/utils:ops",
"//tensorflow/models/research/object_detection/utils:ops",
],
)
......@@ -123,6 +123,7 @@ py_library(
"matcher.py",
],
deps = [
"//tensorflow/models/research/object_detection/utils:ops",
],
)
......@@ -160,8 +161,17 @@ py_library(
":box_list",
":box_list_ops",
":keypoint_ops",
":preprocessor_cache",
":standard_fields",
"//tensorflow",
"//tensorflow/models/research/object_detection/utils:shape_utils",
],
)
py_library(
name = "preprocessor_cache",
srcs = [
"preprocessor_cache.py",
],
)
......@@ -172,6 +182,7 @@ py_test(
],
deps = [
":preprocessor",
":preprocessor_cache",
"//tensorflow",
],
)
......@@ -211,6 +222,7 @@ py_library(
":box_list_ops",
":standard_fields",
"//tensorflow",
"//tensorflow/models/research/object_detection/utils:shape_utils",
],
)
......@@ -232,15 +244,16 @@ py_library(
],
deps = [
":box_list",
":box_list_ops",
":matcher",
":region_similarity_calculator",
":standard_fields",
"//tensorflow",
"//tensorflow_models/object_detection/box_coders:faster_rcnn_box_coder",
"//tensorflow_models/object_detection/box_coders:mean_stddev_box_coder",
"//tensorflow_models/object_detection/core:box_coder",
"//tensorflow_models/object_detection/matchers:argmax_matcher",
"//tensorflow_models/object_detection/matchers:bipartite_matcher",
"//tensorflow/models/research/object_detection/box_coders:faster_rcnn_box_coder",
"//tensorflow/models/research/object_detection/box_coders:mean_stddev_box_coder",
"//tensorflow/models/research/object_detection/core:box_coder",
"//tensorflow/models/research/object_detection/matchers:argmax_matcher",
"//tensorflow/models/research/object_detection/matchers:bipartite_matcher",
"//tensorflow/models/research/object_detection/utils:shape_utils",
],
)
......@@ -254,8 +267,10 @@ py_test(
":region_similarity_calculator",
":target_assigner",
"//tensorflow",
"//tensorflow_models/object_detection/box_coders:mean_stddev_box_coder",
"//tensorflow_models/object_detection/matchers:bipartite_matcher",
"//tensorflow/models/research/object_detection/box_coders:keypoint_box_coder",
"//tensorflow/models/research/object_detection/box_coders:mean_stddev_box_coder",
"//tensorflow/models/research/object_detection/matchers:bipartite_matcher",
"//tensorflow/models/research/object_detection/utils:test_case",
],
)
......@@ -274,9 +289,9 @@ py_library(
srcs = ["box_predictor.py"],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/utils:ops",
"//tensorflow_models/object_detection/utils:shape_utils",
"//tensorflow_models/object_detection/utils:static_shape",
"//tensorflow/models/research/object_detection/utils:ops",
"//tensorflow/models/research/object_detection/utils:shape_utils",
"//tensorflow/models/research/object_detection/utils:static_shape",
],
)
......@@ -286,8 +301,9 @@ py_test(
deps = [
":box_predictor",
"//tensorflow",
"//tensorflow_models/object_detection/builders:hyperparams_builder",
"//tensorflow_models/object_detection/protos:hyperparams_py_pb2",
"//tensorflow/models/research/object_detection/builders:hyperparams_builder",
"//tensorflow/models/research/object_detection/protos:hyperparams_py_pb2",
"//tensorflow/models/research/object_detection/utils:test_case",
],
)
......@@ -298,7 +314,7 @@ py_library(
],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/core:box_list_ops",
"//tensorflow/models/research/object_detection/core:box_list_ops",
],
)
......@@ -309,7 +325,7 @@ py_test(
],
deps = [
":region_similarity_calculator",
"//tensorflow_models/object_detection/core:box_list",
"//tensorflow/models/research/object_detection/core:box_list",
],
)
......@@ -330,7 +346,7 @@ py_library(
],
deps = [
"//tensorflow",
"//tensorflow_models/object_detection/utils:ops",
"//tensorflow/models/research/object_detection/utils:ops",
],
)
......
......@@ -77,8 +77,8 @@ class AnchorGenerator(object):
def generate(self, feature_map_shape_list, **params):
"""Generates a collection of bounding boxes to be used as anchors.
TODO: remove **params from argument list and make stride and offsets (for
multiple_grid_anchor_generator) constructor arguments.
TODO: remove **params from argument list and make stride and
offsets (for multiple_grid_anchor_generator) constructor arguments.
Args:
feature_map_shape_list: list of (height, width) pairs in the format
......@@ -140,3 +140,4 @@ class AnchorGenerator(object):
* feature_map_shape[0]
* feature_map_shape[1])
return tf.assert_equal(expected_num_anchors, anchors.num_boxes())
......@@ -183,7 +183,8 @@ def prune_completely_outside_window(boxlist, window, scope=None):
scope: name scope.
Returns:
pruned_corners: a tensor with shape [M_out, 4] where M_out <= M_in
pruned_boxlist: a new BoxList with all bounding boxes partially or fully in
the window.
valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes
in the input tensor.
"""
......@@ -982,3 +983,79 @@ def pad_or_clip_box_list(boxlist, num_boxes, scope=None):
boxlist.get_field(field), num_boxes)
subboxlist.add_field(field, subfield)
return subboxlist
def select_random_box(boxlist,
default_box=None,
seed=None,
scope=None):
"""Selects a random bounding box from a `BoxList`.
Args:
boxlist: A BoxList.
default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`,
this default box will be returned. If None, will use a default box of
[[-1., -1., -1., -1.]].
seed: Random seed.
scope: Name scope.
Returns:
bbox: A [1, 4] tensor with a random bounding box.
valid: A bool tensor indicating whether a valid bounding box is returned
(True) or whether the default box is returned (False).
"""
with tf.name_scope(scope, 'SelectRandomBox'):
bboxes = boxlist.get()
combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes)
number_of_boxes = combined_shape[0]
default_box = default_box or tf.constant([[-1., -1., -1., -1.]])
def select_box():
random_index = tf.random_uniform([],
maxval=number_of_boxes,
dtype=tf.int32,
seed=seed)
return tf.expand_dims(bboxes[random_index], axis=0), tf.constant(True)
return tf.cond(
tf.greater_equal(number_of_boxes, 1),
true_fn=select_box,
false_fn=lambda: (default_box, tf.constant(False)))
def get_minimal_coverage_box(boxlist,
default_box=None,
scope=None):
"""Creates a single bounding box which covers all boxes in the boxlist.
Args:
boxlist: A BoxList.
default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`,
this default box will be returned. If None, will use a default box of
[[0., 0., 1., 1.]].
scope: Name scope.
Returns:
A [1, 4] float32 tensor with a bounding box that tightly covers all the
boxes in the box list. If the boxlist does not contain any boxes, the
default box is returned.
"""
with tf.name_scope(scope, 'CreateCoverageBox'):
num_boxes = boxlist.num_boxes()
def coverage_box(bboxes):
y_min, x_min, y_max, x_max = tf.split(
value=bboxes, num_or_size_splits=4, axis=1)
y_min_coverage = tf.reduce_min(y_min, axis=0)
x_min_coverage = tf.reduce_min(x_min, axis=0)
y_max_coverage = tf.reduce_max(y_max, axis=0)
x_max_coverage = tf.reduce_max(x_max, axis=0)
return tf.stack(
[y_min_coverage, x_min_coverage, y_max_coverage, x_max_coverage],
axis=1)
default_box = default_box or tf.constant([[0., 0., 1., 1.]])
return tf.cond(
tf.greater_equal(num_boxes, 1),
true_fn=lambda: coverage_box(boxlist.get()),
false_fn=lambda: default_box)
......@@ -153,6 +153,25 @@ class BoxListOpsTest(tf.test.TestCase):
extra_data_out = sess.run(pruned.get_field('extra_data'))
self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [6]])
def test_prune_completely_outside_window_with_empty_boxlist(self):
window = tf.constant([0, 0, 9, 14], tf.float32)
corners = tf.zeros(shape=[0, 4], dtype=tf.float32)
boxes = box_list.BoxList(corners)
boxes.add_field('extra_data', tf.zeros(shape=[0], dtype=tf.int32))
pruned, keep_indices = box_list_ops.prune_completely_outside_window(boxes,
window)
pruned_boxes = pruned.get()
extra = pruned.get_field('extra_data')
exp_pruned_boxes = np.zeros(shape=[0, 4], dtype=np.float32)
exp_extra = np.zeros(shape=[0], dtype=np.int32)
with self.test_session() as sess:
pruned_boxes_out, keep_indices_out, extra_out = sess.run(
[pruned_boxes, keep_indices, extra])
self.assertAllClose(exp_pruned_boxes, pruned_boxes_out)
self.assertAllEqual([], keep_indices_out)
self.assertAllEqual(exp_extra, extra_out)
def test_intersection(self):
corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]])
corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0],
......@@ -593,6 +612,58 @@ class BoxListOpsTest(tf.test.TestCase):
self.assertAllEqual(expected_classes, classes_out)
self.assertAllClose(expected_scores, scores_out)
def test_select_random_box(self):
boxes = [[0., 0., 1., 1.],
[0., 1., 2., 3.],
[0., 2., 3., 4.]]
corners = tf.constant(boxes, dtype=tf.float32)
boxlist = box_list.BoxList(corners)
random_bbox, valid = box_list_ops.select_random_box(boxlist)
with self.test_session() as sess:
random_bbox_out, valid_out = sess.run([random_bbox, valid])
norm_small = any(
[np.linalg.norm(random_bbox_out - box) < 1e-6 for box in boxes])
self.assertTrue(norm_small)
self.assertTrue(valid_out)
def test_select_random_box_with_empty_boxlist(self):
corners = tf.constant([], shape=[0, 4], dtype=tf.float32)
boxlist = box_list.BoxList(corners)
random_bbox, valid = box_list_ops.select_random_box(boxlist)
with self.test_session() as sess:
random_bbox_out, valid_out = sess.run([random_bbox, valid])
expected_bbox_out = np.array([[-1., -1., -1., -1.]], dtype=np.float32)
self.assertAllEqual(expected_bbox_out, random_bbox_out)
self.assertFalse(valid_out)
def test_get_minimal_coverage_box(self):
boxes = [[0., 0., 1., 1.],
[-1., 1., 2., 3.],
[0., 2., 3., 4.]]
expected_coverage_box = [[-1., 0., 3., 4.]]
corners = tf.constant(boxes, dtype=tf.float32)
boxlist = box_list.BoxList(corners)
coverage_box = box_list_ops.get_minimal_coverage_box(boxlist)
with self.test_session() as sess:
coverage_box_out = sess.run(coverage_box)
self.assertAllClose(expected_coverage_box, coverage_box_out)
def test_get_minimal_coverage_box_with_empty_boxlist(self):
corners = tf.constant([], shape=[0, 4], dtype=tf.float32)
boxlist = box_list.BoxList(corners)
coverage_box = box_list_ops.get_minimal_coverage_box(boxlist)
with self.test_session() as sess:
coverage_box_out = sess.run(coverage_box)
self.assertAllClose([[0.0, 0.0, 1.0, 1.0]], coverage_box_out)
class ConcatenateTest(tf.test.TestCase):
......@@ -958,5 +1029,6 @@ class BoxRefinementTest(tf.test.TestCase):
self.assertAllClose(expected_scores, scores_out)
self.assertAllEqual(extra_field_out, [0, 1, 1])
if __name__ == '__main__':
tf.test.main()
......@@ -27,6 +27,7 @@ These modules are separated from the main model since the same
few box predictor architectures are shared across many models.
"""
from abc import abstractmethod
import math
import tensorflow as tf
from object_detection.utils import ops
from object_detection.utils import shape_utils
......@@ -59,8 +60,8 @@ class BoxPredictor(object):
def num_classes(self):
return self._num_classes
def predict(self, image_features, num_predictions_per_location, scope,
**params):
def predict(self, image_features, num_predictions_per_location,
scope=None, **params):
"""Computes encoded object locations and corresponding confidences.
Takes a high level image feature map as input and produces two predictions,
......@@ -70,10 +71,10 @@ class BoxPredictor(object):
and do not assume anything about their shapes.
Args:
image_features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box
predictions to be made per spatial location in the feature map.
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: A list of integers representing the number
of box predictions to be made per spatial location for each feature map.
scope: Variable and Op scope name.
**params: Additional keyword arguments for specific implementations of
BoxPredictor.
......@@ -86,10 +87,22 @@ class BoxPredictor(object):
class_predictions_with_background: A float tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals.
Raises:
ValueError: If length of `image_features` is not equal to length of
`num_predictions_per_location`.
"""
with tf.variable_scope(scope):
return self._predict(image_features, num_predictions_per_location,
**params)
if len(image_features) != len(num_predictions_per_location):
raise ValueError('image_features and num_predictions_per_location must '
'be of same length, found: {} vs {}'.
format(len(image_features),
len(num_predictions_per_location)))
if scope is not None:
with tf.variable_scope(scope):
return self._predict(image_features, num_predictions_per_location,
**params)
return self._predict(image_features, num_predictions_per_location,
**params)
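# Callers now pass parallel lists, one entry per feature map, and may omit
# scope; an illustrative sketch (predictor construction and feature tensors
# elided, names hypothetical):
predictions = box_predictor_object.predict(
    image_features=[level_3_features, level_4_features],  # one tensor per map
    num_predictions_per_location=[6, 6],                  # one int per map
    scope='BoxPredictor')
box_encodings = predictions[BOX_ENCODINGS]
class_predictions = predictions[CLASS_PREDICTIONS_WITH_BACKGROUND]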
# TODO: num_predictions_per_location could be moved to constructor.
# This is currently only used by ConvolutionalBoxPredictor.
......@@ -98,10 +111,10 @@ class BoxPredictor(object):
"""Implementations must override this method.
Args:
image_features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box
predictions to be made per spatial location in the feature map.
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: A list of integers representing the number
of box predictions to be made per spatial location for each feature map.
**params: Additional keyword arguments for specific implementations of
BoxPredictor.
......@@ -169,28 +182,35 @@ class RfcnBoxPredictor(BoxPredictor):
"""Computes encoded object locations and corresponding confidences.
Args:
image_features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box
predictions to be made per spatial location in the feature map.
Currently, this must be set to 1, or an error will be raised.
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: A list of integers representing the number
of box predictions to be made per spatial location for each feature map.
Currently, this must be set to [1], or an error will be raised.
proposal_boxes: A float tensor of shape [batch_size, num_proposals,
box_code_size].
Returns:
box_encodings: A float tensor of shape
[batch_size, 1, num_classes, code_size] representing the
[batch_size, num_anchors, num_classes, code_size] representing the
location of the objects.
class_predictions_with_background: A float tensor of shape
[batch_size, 1, num_classes + 1] representing the class
[batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals.
Raises:
ValueError: if num_predictions_per_location is not 1.
ValueError: if num_predictions_per_location is not 1 or if
len(image_features) is not 1.
"""
if num_predictions_per_location != 1:
if (len(num_predictions_per_location) != 1 or
num_predictions_per_location[0] != 1):
raise ValueError('Currently RfcnBoxPredictor only supports '
'predicting a single box per class per location.')
if len(image_features) != 1:
raise ValueError('length of `image_features` must be 1. Found {}'.
format(len(image_features)))
image_feature = image_features[0]
num_predictions_per_location = num_predictions_per_location[0]
batch_size = tf.shape(proposal_boxes)[0]
num_boxes = tf.shape(proposal_boxes)[1]
def get_box_indices(proposals):
......@@ -202,7 +222,7 @@ class RfcnBoxPredictor(BoxPredictor):
tf.range(start=0, limit=proposals_shape[0]), 1)
return tf.reshape(ones_mat * multiplier, [-1])
net = image_features
net = image_feature
with slim.arg_scope(self._conv_hyperparams):
net = slim.conv2d(net, self._depth, [1, 1], scope='reduce_depth')
# Location predictions.
......@@ -280,6 +300,7 @@ class MaskRCNNBoxPredictor(BoxPredictor):
predict_instance_masks=False,
mask_height=14,
mask_width=14,
mask_prediction_num_conv_layers=2,
mask_prediction_conv_depth=256,
predict_keypoints=False):
"""Constructor.
......@@ -304,13 +325,21 @@ class MaskRCNNBoxPredictor(BoxPredictor):
boxes.
mask_height: Desired output mask height. The default value is 14.
mask_width: Desired output mask width. The default value is 14.
mask_prediction_num_conv_layers: Number of convolution layers applied to
the image_features in mask prediction branch.
mask_prediction_conv_depth: The depth for the first conv2d_transpose op
applied to the image_features in the mask prediciton branch.
applied to the image_features in the mask prediction branch. If set
to 0, the depth of the convolution layers will be automatically chosen
based on the number of object classes and the number of channels in the
image features.
predict_keypoints: Whether to predict keypoints inside detection boxes.
Raises:
ValueError: If predict_instance_masks or predict_keypoints is true.
ValueError: If predict_instance_masks is true but conv_hyperparams is not
set.
ValueError: If predict_keypoints is true since it is not implemented yet.
ValueError: If mask_prediction_num_conv_layers is smaller than two.
"""
super(MaskRCNNBoxPredictor, self).__init__(is_training, num_classes)
self._fc_hyperparams = fc_hyperparams
......@@ -321,6 +350,7 @@ class MaskRCNNBoxPredictor(BoxPredictor):
self._predict_instance_masks = predict_instance_masks
self._mask_height = mask_height
self._mask_width = mask_width
self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers
self._mask_prediction_conv_depth = mask_prediction_conv_depth
self._predict_keypoints = predict_keypoints
if self._predict_keypoints:
......@@ -329,52 +359,33 @@ class MaskRCNNBoxPredictor(BoxPredictor):
self._conv_hyperparams is None):
raise ValueError('`conv_hyperparams` must be provided when predicting '
'masks.')
if self._mask_prediction_num_conv_layers < 2:
raise ValueError(
'Mask prediction should consist of at least 2 conv layers')
@property
def num_classes(self):
return self._num_classes
def _predict(self, image_features, num_predictions_per_location):
"""Computes encoded object locations and corresponding confidences.
Flattens image_features and applies fully connected ops (with no
non-linearity) to predict box encodings and class predictions. In this
setting, anchors are not spatially arranged in any way and are assumed to
have been folded into the batch dimension. Thus we output 1 for the
anchors dimension.
@property
def predicts_instance_masks(self):
return self._predict_instance_masks
Also optionally predicts instance masks.
The mask prediction head is based on the Mask RCNN paper with the following
modifications: We replace the deconvolution layer with a bilinear resize
and a convolution.
def _predict_boxes_and_classes(self, image_features):
"""Predicts boxes and class scores.
Args:
image_features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box
predictions to be made per spatial location in the feature map.
Currently, this must be set to 1, or an error will be raised.
Returns:
A dictionary containing the following tensors.
box_encodings: A float tensor of shape
[batch_size, 1, num_classes, code_size] representing the
location of the objects.
class_predictions_with_background: A float tensor of shape
[batch_size, 1, num_classes + 1] representing the class
predictions for the proposals.
If predict_masks is True the dictionary also contains:
instance_masks: A float tensor of shape
[batch_size, 1, num_classes, image_height, image_width]
If predict_keypoints is True the dictionary also contains:
keypoints: [batch_size, 1, num_keypoints, 2]
Raises:
ValueError: if num_predictions_per_location is not 1.
box_encodings: A float tensor of shape
[batch_size, 1, num_classes, code_size] representing the location of the
objects.
class_predictions_with_background: A float tensor of shape
[batch_size, 1, num_classes + 1] representing the class predictions for
the proposals.
"""
if num_predictions_per_location != 1:
raise ValueError('Currently FullyConnectedBoxPredictor only supports '
'predicting a single box per class per location.')
spatial_averaged_image_features = tf.reduce_mean(image_features, [1, 2],
keep_dims=True,
name='AvgPool')
......@@ -398,34 +409,155 @@ class MaskRCNNBoxPredictor(BoxPredictor):
box_encodings, [-1, 1, self._num_classes, self._box_code_size])
class_predictions_with_background = tf.reshape(
class_predictions_with_background, [-1, 1, self._num_classes + 1])
return box_encodings, class_predictions_with_background
def _get_mask_predictor_conv_depth(self, num_feature_channels, num_classes,
class_weight=3.0, feature_weight=2.0):
"""Computes the depth of the mask predictor convolutions.
Computes the depth of the mask predictor convolutions given the number of
feature channels and the number of classes, as a weighted average of the two
in log space. The weights used for the average do not need to sum to 1.
Args:
num_feature_channels: An integer containing the number of feature
channels.
num_classes: An integer containing the number of classes.
class_weight: Class weight used in computing the weighted average.
feature_weight: Feature weight used in computing the weighted average.
predictions_dict = {
BOX_ENCODINGS: box_encodings,
CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background
}
if self._predict_instance_masks:
with slim.arg_scope(self._conv_hyperparams):
upsampled_features = tf.image.resize_bilinear(
image_features,
[self._mask_height, self._mask_width],
align_corners=True)
Returns:
An integer containing the number of convolution channels used by the mask
predictor.
"""
num_feature_channels_log = math.log(float(num_feature_channels), 2.0)
num_classes_log = math.log(float(num_classes), 2.0)
weighted_num_feature_channels_log = (
num_feature_channels_log * feature_weight)
weighted_num_classes_log = num_classes_log * class_weight
total_weight = feature_weight + class_weight
num_conv_channels_log = round(
(weighted_num_feature_channels_log + weighted_num_classes_log) /
total_weight)
return int(math.pow(2.0, num_conv_channels_log))
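# A worked instance of the weighted log-space average above (input values
# chosen purely for illustration):
#   num_feature_channels=256, num_classes=90,
#   feature_weight=2.0, class_weight=3.0
#   log2(256) = 8.0     -> weighted: 8.0 * 2.0    = 16.0
#   log2(90) ~= 6.4919  -> weighted: 6.4919 * 3.0 = 19.4757
#   (16.0 + 19.4757) / (2.0 + 3.0) = 7.0951 -> round -> 7 -> 2**7 = 128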
def _predict_masks(self, image_features):
"""Performs mask prediction.
Args:
image_features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images.
Returns:
instance_masks: A float tensor of shape
[batch_size, 1, num_classes, image_height, image_width].
"""
num_conv_channels = self._mask_prediction_conv_depth
if num_conv_channels == 0:
num_feature_channels = image_features.get_shape().as_list()[3]
num_conv_channels = self._get_mask_predictor_conv_depth(
num_feature_channels, self.num_classes)
with slim.arg_scope(self._conv_hyperparams):
upsampled_features = tf.image.resize_bilinear(
image_features,
[self._mask_height, self._mask_width],
align_corners=True)
for _ in range(self._mask_prediction_num_conv_layers - 1):
upsampled_features = slim.conv2d(
upsampled_features,
num_outputs=self._mask_prediction_conv_depth,
kernel_size=[2, 2])
mask_predictions = slim.conv2d(upsampled_features,
num_outputs=self.num_classes,
activation_fn=None,
kernel_size=[3, 3])
instance_masks = tf.expand_dims(tf.transpose(mask_predictions,
perm=[0, 3, 1, 2]),
axis=1,
name='MaskPredictor')
predictions_dict[MASK_PREDICTIONS] = instance_masks
num_outputs=num_conv_channels,
kernel_size=[3, 3])
mask_predictions = slim.conv2d(upsampled_features,
num_outputs=self.num_classes,
activation_fn=None,
kernel_size=[3, 3])
return tf.expand_dims(
tf.transpose(mask_predictions, perm=[0, 3, 1, 2]),
axis=1,
name='MaskPredictor')
def _predict(self, image_features, num_predictions_per_location,
predict_boxes_and_classes=True, predict_auxiliary_outputs=False):
"""Optionally computes encoded object locations, confidences, and masks.
Flattens image_features and applies fully connected ops (with no
non-linearity) to predict box encodings and class predictions. In this
setting, anchors are not spatially arranged in any way and are assumed to
have been folded into the batch dimension. Thus we output 1 for the
anchors dimension.
Also optionally predicts instance masks.
The mask prediction head is based on the Mask RCNN paper with the following
modifications: We replace the deconvolution layer with a bilinear resize
and a convolution.
Args:
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels_i] containing features for a batch of images.
num_predictions_per_location: A list of integers representing the number
of box predictions to be made per spatial location for each feature map.
Currently, this must be set to [1], or an error will be raised.
predict_boxes_and_classes: If true, the function will perform box
refinement and classification.
predict_auxiliary_outputs: If true, the function will perform other
predictions such as mask, keypoint, boundaries, etc. if any.
Returns:
A dictionary containing the following tensors.
box_encodings: A float tensor of shape
[batch_size, 1, num_classes, code_size] representing the
location of the objects.
class_predictions_with_background: A float tensor of shape
[batch_size, 1, num_classes + 1] representing the class
predictions for the proposals.
If predict_masks is True the dictionary also contains:
instance_masks: A float tensor of shape
[batch_size, 1, num_classes, image_height, image_width]
If predict_keypoints is True the dictionary also contains:
keypoints: [batch_size, 1, num_keypoints, 2]
Raises:
ValueError: If num_predictions_per_location is not 1 or if both
predict_boxes_and_classes and predict_auxiliary_outputs are false or if
len(image_features) is not 1.
"""
if (len(num_predictions_per_location) != 1 or
num_predictions_per_location[0] != 1):
raise ValueError('Currently FullyConnectedBoxPredictor only supports '
'predicting a single box per class per location.')
if not predict_boxes_and_classes and not predict_auxiliary_outputs:
raise ValueError('Should perform at least one prediction.')
if len(image_features) != 1:
raise ValueError('length of `image_features` must be 1. Found {}'.
format(len(image_features)))
image_feature = image_features[0]
num_predictions_per_location = num_predictions_per_location[0]
predictions_dict = {}
if predict_boxes_and_classes:
(box_encodings, class_predictions_with_background
) = self._predict_boxes_and_classes(image_feature)
predictions_dict[BOX_ENCODINGS] = box_encodings
predictions_dict[
CLASS_PREDICTIONS_WITH_BACKGROUND] = class_predictions_with_background
if self._predict_instance_masks and predict_auxiliary_outputs:
predictions_dict[MASK_PREDICTIONS] = self._predict_masks(image_feature)
return predictions_dict
class _NoopVariableScope(object):
"""A dummy class that does not push any scope."""
def __enter__(self):
return None
def __exit__(self, exc_type, exc_value, traceback):
return False
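# The no-op scope keeps variable names of single-feature-map models identical
# to existing checkpoints, while multi-map models get per-map scopes; a
# minimal standalone sketch of the pattern (illustrative only):
def _box_predictor_scopes(num_feature_maps):
  if num_feature_maps == 1:
    return [_NoopVariableScope()]  # no scope pushed: legacy variable names
  return [tf.variable_scope('BoxPredictor_{}'.format(i))
          for i in range(num_feature_maps)]

for feature_map, scope in zip(image_features,
                              _box_predictor_scopes(len(image_features))):
  with scope:
    pass  # prediction convolutions are built here, as in _predict below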
class ConvolutionalBoxPredictor(BoxPredictor):
"""Convolutional Box Predictor.
......@@ -450,7 +582,8 @@ class ConvolutionalBoxPredictor(BoxPredictor):
kernel_size,
box_code_size,
apply_sigmoid_to_scores=False,
class_prediction_bias_init=0.0):
class_prediction_bias_init=0.0,
use_depthwise=False):
"""Constructor.
Args:
......@@ -479,6 +612,8 @@ class ConvolutionalBoxPredictor(BoxPredictor):
class_predictions.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction.
use_depthwise: Whether to use depthwise convolutions for prediction
steps. Default is False.
Raises:
ValueError: if min_depth > max_depth.
......@@ -496,15 +631,17 @@ class ConvolutionalBoxPredictor(BoxPredictor):
self._dropout_keep_prob = dropout_keep_prob
self._apply_sigmoid_to_scores = apply_sigmoid_to_scores
self._class_prediction_bias_init = class_prediction_bias_init
self._use_depthwise = use_depthwise
def _predict(self, image_features, num_predictions_per_location):
def _predict(self, image_features, num_predictions_per_location_list):
"""Computes encoded object locations and corresponding confidences.
Args:
image_features: A float tensor of shape [batch_size, height, width,
channels] containing features for a batch of images.
num_predictions_per_location: an integer representing the number of box
predictions to be made per spatial location in the feature map.
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels_i] containing features for a batch of images.
num_predictions_per_location_list: A list of integers representing the
number of box predictions to be made per spatial location for each
feature map.
Returns:
A dictionary containing the following tensors.
......@@ -514,53 +651,245 @@ class ConvolutionalBoxPredictor(BoxPredictor):
class_predictions_with_background: A float tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals.
"""
# Add a slot for the background class.
num_class_slots = self.num_classes + 1
net = image_features
with slim.arg_scope(self._conv_hyperparams), \
slim.arg_scope([slim.dropout], is_training=self._is_training):
# Add additional conv layers before the class predictor.
features_depth = static_shape.get_depth(image_features.get_shape())
depth = max(min(features_depth, self._max_depth), self._min_depth)
tf.logging.info('depth of additional conv before box predictor: {}'.
format(depth))
if depth > 0 and self._num_layers_before_predictor > 0:
for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
with slim.arg_scope([slim.conv2d], activation_fn=None,
normalizer_fn=None, normalizer_params=None):
box_encodings = slim.conv2d(
net, num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
scope='BoxEncodingPredictor')
if self._use_dropout:
net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
class_predictions_with_background = slim.conv2d(
net, num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size], scope='ClassPredictor',
biases_initializer=tf.constant_initializer(
self._class_prediction_bias_init))
if self._apply_sigmoid_to_scores:
class_predictions_with_background = tf.sigmoid(
class_predictions_with_background)
combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
image_features)
box_encodings = tf.reshape(
box_encodings, tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
1, self._box_code_size]))
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
num_class_slots]))
return {BOX_ENCODINGS: box_encodings,
box_encodings_list = []
class_predictions_list = []
# TODO: Come up with a better way to generate scope names
# in box predictor once we have time to retrain all models in the zoo.
# The following lines create scope names to be backwards compatible with the
# existing checkpoints.
box_predictor_scopes = [_NoopVariableScope()]
if len(image_features) > 1:
box_predictor_scopes = [
tf.variable_scope('BoxPredictor_{}'.format(i))
for i in range(len(image_features))
]
for (image_feature,
num_predictions_per_location, box_predictor_scope) in zip(
image_features, num_predictions_per_location_list,
box_predictor_scopes):
with box_predictor_scope:
# Add a slot for the background class.
num_class_slots = self.num_classes + 1
net = image_feature
with slim.arg_scope(self._conv_hyperparams), \
slim.arg_scope([slim.dropout], is_training=self._is_training):
# Add additional conv layers before the class predictor.
features_depth = static_shape.get_depth(image_feature.get_shape())
depth = max(min(features_depth, self._max_depth), self._min_depth)
tf.logging.info('depth of additional conv before box predictor: {}'.
format(depth))
if depth > 0 and self._num_layers_before_predictor > 0:
for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
with slim.arg_scope([slim.conv2d], activation_fn=None,
normalizer_fn=None, normalizer_params=None):
if self._use_depthwise:
box_encodings = slim.separable_conv2d(
net, None, [self._kernel_size, self._kernel_size],
padding='SAME', depth_multiplier=1, stride=1,
rate=1, scope='BoxEncodingPredictor_depthwise')
box_encodings = slim.conv2d(
box_encodings,
num_predictions_per_location * self._box_code_size, [1, 1],
scope='BoxEncodingPredictor')
else:
box_encodings = slim.conv2d(
net, num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
scope='BoxEncodingPredictor')
if self._use_dropout:
net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
if self._use_depthwise:
class_predictions_with_background = slim.separable_conv2d(
net, None, [self._kernel_size, self._kernel_size],
padding='SAME', depth_multiplier=1, stride=1,
rate=1, scope='ClassPredictor_depthwise')
class_predictions_with_background = slim.conv2d(
class_predictions_with_background,
num_predictions_per_location * num_class_slots,
[1, 1], scope='ClassPredictor')
else:
class_predictions_with_background = slim.conv2d(
net, num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size],
scope='ClassPredictor',
biases_initializer=tf.constant_initializer(
self._class_prediction_bias_init))
if self._apply_sigmoid_to_scores:
class_predictions_with_background = tf.sigmoid(
class_predictions_with_background)
combined_feature_map_shape = (shape_utils.
combined_static_and_dynamic_shape(
image_feature))
box_encodings = tf.reshape(
box_encodings, tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
1, self._box_code_size]))
box_encodings_list.append(box_encodings)
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
num_class_slots]))
class_predictions_list.append(class_predictions_with_background)
return {BOX_ENCODINGS: tf.concat(box_encodings_list, axis=1),
CLASS_PREDICTIONS_WITH_BACKGROUND:
class_predictions_with_background}
tf.concat(class_predictions_list, axis=1)}
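A small illustrative sketch (shapes chosen for this note, not taken from the commit) of the anchor-count arithmetic performed by the reshape and concat above:

# Pure-Python sketch: each feature map of size h x w with n predictions
# per location contributes h*w*n anchors; the tf.concat along axis=1
# stacks them into a single [batch, total_anchors, 1, code_size] tensor.
batch_size, box_code_size = 4, 4
feature_map_shapes = [(8, 8), (4, 4)]
num_predictions_per_location_list = [5, 3]

anchors_per_map = [h * w * n
                   for (h, w), n in zip(feature_map_shapes,
                                        num_predictions_per_location_list)]
print(anchors_per_map)       # [320, 48]
print(sum(anchors_per_map))  # 368 anchors in the concatenated output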
# TODO: Merge the implementation with ConvolutionalBoxPredictor above
# since they are very similar.
class WeightSharedConvolutionalBoxPredictor(BoxPredictor):
"""Convolutional Box Predictor with weight sharing.
Implements the box predictor described in
https://arxiv.org/abs/1708.02002. This class differs from
ConvolutionalBoxPredictor in that it shares weights and biases while
predicting from different feature maps. Separate multi-layer towers are
constructed for the box encoding and class predictors respectively.
"""
def __init__(self,
is_training,
num_classes,
conv_hyperparams,
depth,
num_layers_before_predictor,
box_code_size,
kernel_size=3,
class_prediction_bias_init=0.0):
"""Constructor.
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
conv_hyperparams: Slim arg_scope with hyperparameters for convolution ops.
depth: depth of conv layers.
num_layers_before_predictor: Number of the additional conv layers before
the predictor.
box_code_size: Size of encoding for each box.
kernel_size: Size of final convolution kernel.
class_prediction_bias_init: constant value to initialize bias of the last
conv2d layer before class prediction.
"""
super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training,
num_classes)
self._conv_hyperparams = conv_hyperparams
self._depth = depth
self._num_layers_before_predictor = num_layers_before_predictor
self._box_code_size = box_code_size
self._kernel_size = kernel_size
self._class_prediction_bias_init = class_prediction_bias_init
def _predict(self, image_features, num_predictions_per_location_list):
"""Computes encoded object locations and corresponding confidences.
Args:
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels] containing features for a batch of images. Note that
all tensors in the list must have the same number of channels.
num_predictions_per_location_list: A list of integers representing the
number of box predictions to be made per spatial location for each
feature map. Note that all values must be the same since the weights are
shared.
Returns:
A dictionary containing the following tensors.
box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
code_size] representing the location of the objects, where
num_anchors = feat_height * feat_width * num_predictions_per_location
class_predictions_with_background: A float tensor of shape
[batch_size, num_anchors, num_classes + 1] representing the class
predictions for the proposals.
Raises:
ValueError: If the image feature maps do not have the same number of
channels or if num_predictions_per_location differs between the
feature maps.
"""
if len(set(num_predictions_per_location_list)) > 1:
raise ValueError('num predictions per location must be the same for all '
                 'feature maps, found: {}'.format(
num_predictions_per_location_list))
feature_channels = [
image_feature.shape[3].value for image_feature in image_features
]
if len(set(feature_channels)) > 1:
raise ValueError('all feature maps must have the same number of '
'channels, found: {}'.format(feature_channels))
box_encodings_list = []
class_predictions_list = []
for (image_feature, num_predictions_per_location) in zip(
image_features, num_predictions_per_location_list):
# Add a slot for the background class.
with tf.variable_scope('WeightSharedConvolutionalBoxPredictor',
reuse=tf.AUTO_REUSE):
num_class_slots = self.num_classes + 1
box_encodings_net = image_feature
class_predictions_net = image_feature
with slim.arg_scope(self._conv_hyperparams):
for i in range(self._num_layers_before_predictor):
box_encodings_net = slim.conv2d(
box_encodings_net,
self._depth,
[self._kernel_size, self._kernel_size],
stride=1,
padding='SAME',
scope='BoxEncodingPredictionTower/conv2d_{}'.format(i))
box_encodings = slim.conv2d(
box_encodings_net,
num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
activation_fn=None, stride=1, padding='SAME',
scope='BoxEncodingPredictor')
for i in range(self._num_layers_before_predictor):
class_predictions_net = slim.conv2d(
class_predictions_net,
self._depth,
[self._kernel_size, self._kernel_size],
stride=1,
padding='SAME',
scope='ClassPredictionTower/conv2d_{}'.format(i))
class_predictions_with_background = slim.conv2d(
class_predictions_net,
num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size],
activation_fn=None, stride=1, padding='SAME',
biases_initializer=tf.constant_initializer(
self._class_prediction_bias_init),
scope='ClassPredictor')
combined_feature_map_shape = (shape_utils.
combined_static_and_dynamic_shape(
image_feature))
box_encodings = tf.reshape(
box_encodings, tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
1, self._box_code_size]))
box_encodings_list.append(box_encodings)
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
num_class_slots]))
class_predictions_list.append(class_predictions_with_background)
return {BOX_ENCODINGS: tf.concat(box_encodings_list, axis=1),
CLASS_PREDICTIONS_WITH_BACKGROUND:
tf.concat(class_predictions_list, axis=1)}
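The weight sharing relies on tf.AUTO_REUSE: every loop iteration enters the same variable scope, so the first feature map creates the variables and later maps reuse them. A minimal standalone sketch of that mechanism (TF 1.x assumed; names are illustrative):

import tensorflow as tf

slim = tf.contrib.slim

def shared_predict(features):
  with tf.variable_scope('WeightSharedConvolutionalBoxPredictor',
                         reuse=tf.AUTO_REUSE):
    # Same scope name on every call, so variables are created once and
    # then reused; inputs must have matching channel counts.
    return slim.conv2d(features, 8, [3, 3], scope='BoxEncodingPredictor')

out1 = shared_predict(tf.random_uniform([1, 16, 16, 32]))
out2 = shared_predict(tf.random_uniform([1, 8, 8, 32]))
# Only one weight/bias pair exists despite two calls:
print(sorted(v.op.name for v in tf.trainable_variables()))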
......@@ -14,7 +14,6 @@
# ==============================================================================
"""Tests for object_detection.core.box_predictor."""
import numpy as np
import tensorflow as tf
......@@ -22,6 +21,7 @@ from google.protobuf import text_format
from object_detection.builders import hyperparams_builder
from object_detection.core import box_predictor
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
class MaskRCNNBoxPredictorTest(tf.test.TestCase):
......@@ -55,7 +55,8 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase):
box_code_size=4,
)
box_predictions = mask_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor')
[image_features], num_predictions_per_location=[1],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
......@@ -93,12 +94,16 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase):
op_type=hyperparams_pb2.Hyperparams.CONV),
predict_instance_masks=True)
box_predictions = mask_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor')
[image_features],
num_predictions_per_location=[1],
scope='BoxPredictor',
predict_boxes_and_classes=True,
predict_auxiliary_outputs=True)
mask_predictions = box_predictions[box_predictor.MASK_PREDICTIONS]
self.assertListEqual([2, 1, 5, 14, 14],
mask_predictions.get_shape().as_list())
def test_do_not_return_instance_masks_and_keypoints_without_request(self):
def test_do_not_return_instance_masks_without_request(self):
image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32)
mask_box_predictor = box_predictor.MaskRCNNBoxPredictor(
is_training=False,
......@@ -108,7 +113,8 @@ class MaskRCNNBoxPredictorTest(tf.test.TestCase):
dropout_keep_prob=0.5,
box_code_size=4)
box_predictions = mask_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor')
[image_features], num_predictions_per_location=[1],
scope='BoxPredictor')
self.assertEqual(len(box_predictions), 2)
self.assertTrue(box_predictor.BOX_ENCODINGS in box_predictions)
self.assertTrue(box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND
......@@ -156,7 +162,8 @@ class RfcnBoxPredictorTest(tf.test.TestCase):
box_code_size=4
)
box_predictions = rfcn_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor',
[image_features], num_predictions_per_location=[1],
scope='BoxPredictor',
proposal_boxes=proposal_boxes)
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
......@@ -173,7 +180,7 @@ class RfcnBoxPredictorTest(tf.test.TestCase):
self.assertAllEqual(class_predictions_shape, [8, 1, 3])
class ConvolutionalBoxPredictorTest(tf.test.TestCase):
class ConvolutionalBoxPredictorTest(test_case.TestCase):
def _build_arg_scope_with_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams()
......@@ -192,7 +199,94 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
return hyperparams_builder.build(conv_hyperparams, is_training=True)
def test_get_boxes_for_five_aspect_ratios_per_location(self):
image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32)
def graph_fn(image_features):
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, objectness_predictions)
image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
(box_encodings, objectness_predictions) = self.execute(graph_fn,
[image_features])
self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
self.assertAllEqual(objectness_predictions.shape, [4, 320, 1])
def test_get_boxes_for_one_aspect_ratio_per_location(self):
def graph_fn(image_features):
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[1],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, objectness_predictions)
image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
(box_encodings, objectness_predictions) = self.execute(graph_fn,
[image_features])
self.assertAllEqual(box_encodings.shape, [4, 64, 1, 4])
self.assertAllEqual(objectness_predictions.shape, [4, 64, 1])
def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(
self):
num_classes_without_background = 6
image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
def graph_fn(image_features):
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
box_predictions = conv_box_predictor.predict(
[image_features],
num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, class_predictions_with_background)
(box_encodings,
class_predictions_with_background) = self.execute(graph_fn,
[image_features])
self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
self.assertAllEqual(class_predictions_with_background.shape,
[4, 320, num_classes_without_background+1])
def test_get_predictions_with_feature_maps_of_dynamic_shape(
self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
......@@ -206,22 +300,38 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
box_code_size=4
)
box_predictions = conv_box_predictor.predict(
image_features, num_predictions_per_location=5, scope='BoxPredictor')
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
init_op = tf.global_variables_initializer()
resolution = 32
expected_num_anchors = resolution*resolution*5
with self.test_session() as sess:
sess.run(init_op)
(box_encodings_shape,
objectness_predictions_shape) = sess.run(
[tf.shape(box_encodings), tf.shape(objectness_predictions)])
self.assertAllEqual(box_encodings_shape, [4, 320, 1, 4])
self.assertAllEqual(objectness_predictions_shape, [4, 320, 1])
[tf.shape(box_encodings), tf.shape(objectness_predictions)],
feed_dict={image_features:
np.random.rand(4, resolution, resolution, 64)})
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
self.assertAllEqual(objectness_predictions_shape,
[4, expected_num_anchors, 1])
expected_variable_set = set([
'BoxPredictor/Conv2d_0_1x1_32/biases',
'BoxPredictor/Conv2d_0_1x1_32/weights',
'BoxPredictor/BoxEncodingPredictor/biases',
'BoxPredictor/BoxEncodingPredictor/weights',
'BoxPredictor/ClassPredictor/biases',
'BoxPredictor/ClassPredictor/weights'])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_get_boxes_for_one_aspect_ratio_per_location(self):
image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32)
def test_use_depthwise_convolution(self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
......@@ -229,77 +339,210 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
box_code_size=4,
use_dropout=True,
use_depthwise=True
)
box_predictions = conv_box_predictor.predict(
image_features, num_predictions_per_location=1, scope='BoxPredictor')
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
init_op = tf.global_variables_initializer()
resolution = 32
expected_num_anchors = resolution*resolution*5
with self.test_session() as sess:
sess.run(init_op)
(box_encodings_shape,
objectness_predictions_shape) = sess.run(
[tf.shape(box_encodings), tf.shape(objectness_predictions)])
self.assertAllEqual(box_encodings_shape, [4, 64, 1, 4])
self.assertAllEqual(objectness_predictions_shape, [4, 64, 1])
[tf.shape(box_encodings), tf.shape(objectness_predictions)],
feed_dict={image_features:
np.random.rand(4, resolution, resolution, 64)})
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4])
self.assertAllEqual(objectness_predictions_shape,
[4, expected_num_anchors, 1])
expected_variable_set = set([
'BoxPredictor/Conv2d_0_1x1_32/biases',
'BoxPredictor/Conv2d_0_1x1_32/weights',
'BoxPredictor/BoxEncodingPredictor_depthwise/biases',
'BoxPredictor/BoxEncodingPredictor_depthwise/depthwise_weights',
'BoxPredictor/BoxEncodingPredictor/biases',
'BoxPredictor/BoxEncodingPredictor/weights',
'BoxPredictor/ClassPredictor_depthwise/biases',
'BoxPredictor/ClassPredictor_depthwise/depthwise_weights',
'BoxPredictor/ClassPredictor/biases',
'BoxPredictor/ClassPredictor/weights'])
self.assertEqual(expected_variable_set, actual_variable_set)
class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase):
def _build_arg_scope_with_conv_hyperparams(self):
conv_hyperparams = hyperparams_pb2.Hyperparams()
conv_hyperparams_text_proto = """
activation: RELU_6
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
"""
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams)
return hyperparams_builder.build(conv_hyperparams, is_training=True)
def test_get_boxes_for_five_aspect_ratios_per_location(self):
def graph_fn(image_features):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
box_predictions = conv_box_predictor.predict(
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, objectness_predictions)
image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
(box_encodings, objectness_predictions) = self.execute(
graph_fn, [image_features])
self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
self.assertAllEqual(objectness_predictions.shape, [4, 320, 1])
def test_get_multi_class_predictions_for_five_aspect_ratios_per_location(
self):
num_classes_without_background = 6
image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32)
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
box_predictions = conv_box_predictor.predict(
image_features,
num_predictions_per_location=5,
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
def graph_fn(image_features):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
box_predictions = conv_box_predictor.predict(
[image_features],
num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, class_predictions_with_background)
init_op = tf.global_variables_initializer()
with self.test_session() as sess:
sess.run(init_op)
(box_encodings_shape, class_predictions_with_background_shape
) = sess.run([
tf.shape(box_encodings), tf.shape(class_predictions_with_background)])
self.assertAllEqual(box_encodings_shape, [4, 320, 1, 4])
self.assertAllEqual(class_predictions_with_background_shape,
[4, 320, num_classes_without_background+1])
def test_get_boxes_for_five_aspect_ratios_per_location_fully_convolutional(
image_features = np.random.rand(4, 8, 8, 64).astype(np.float32)
(box_encodings, class_predictions_with_background) = self.execute(
graph_fn, [image_features])
self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4])
self.assertAllEqual(class_predictions_with_background.shape,
[4, 320, num_classes_without_background+1])
def test_get_multi_class_predictions_from_two_feature_maps(
self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=1,
box_code_size=4)
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, class_predictions_with_background)
image_features1 = np.random.rand(4, 8, 8, 64).astype(np.float32)
image_features2 = np.random.rand(4, 8, 8, 64).astype(np.float32)
(box_encodings, class_predictions_with_background) = self.execute(
graph_fn, [image_features1, image_features2])
self.assertAllEqual(box_encodings.shape, [4, 640, 1, 4])
self.assertAllEqual(class_predictions_with_background.shape,
[4, 640, num_classes_without_background+1])
def test_predictions_from_multiple_feature_maps_share_weights(self):
num_classes_without_background = 6
def graph_fn(image_features1, image_features2):
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=num_classes_without_background,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
depth=32,
num_layers_before_predictor=2,
box_code_size=4)
box_predictions = conv_box_predictor.predict(
[image_features1, image_features2],
num_predictions_per_location=[5, 5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
class_predictions_with_background = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
return (box_encodings, class_predictions_with_background)
with self.test_session(graph=tf.Graph()):
graph_fn(tf.random_uniform([4, 32, 32, 3], dtype=tf.float32),
tf.random_uniform([4, 16, 16, 3], dtype=tf.float32))
actual_variable_set = set(
[var.op.name for var in tf.trainable_variables()])
expected_variable_set = set([
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_0/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_1/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictionTower/conv2d_1/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_0/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictionTower/conv2d_1/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictor/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'BoxEncodingPredictor/biases'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/weights'),
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/'
'ClassPredictor/biases')])
self.assertEqual(expected_variable_set, actual_variable_set)
def test_get_predictions_with_feature_maps_of_dynamic_shape(
self):
image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64])
conv_box_predictor = box_predictor.ConvolutionalBoxPredictor(
conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor(
is_training=False,
num_classes=0,
conv_hyperparams=self._build_arg_scope_with_conv_hyperparams(),
min_depth=0,
max_depth=32,
depth=32,
num_layers_before_predictor=1,
use_dropout=True,
dropout_keep_prob=0.8,
kernel_size=1,
box_code_size=4
)
box_code_size=4)
box_predictions = conv_box_predictor.predict(
image_features, num_predictions_per_location=5, scope='BoxPredictor')
[image_features], num_predictions_per_location=[5],
scope='BoxPredictor')
box_encodings = box_predictions[box_predictor.BOX_ENCODINGS]
objectness_predictions = box_predictions[
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND]
......@@ -318,6 +561,5 @@ class ConvolutionalBoxPredictorTest(tf.test.TestCase):
self.assertAllEqual(objectness_predictions_shape,
[4, expected_num_anchors, 1])
if __name__ == '__main__':
tf.test.main()
......@@ -50,8 +50,10 @@ class Loss(object):
"""Call the loss function.
Args:
prediction_tensor: a tensor representing predicted quantities.
target_tensor: a tensor representing regression or classification targets.
prediction_tensor: an N-d tensor of shape [batch, anchors, ...]
representing predicted quantities.
target_tensor: an N-d tensor of shape [batch, anchors, ...] representing
regression or classification targets.
ignore_nan_targets: whether to ignore nan targets in the loss computation.
E.g. can be used if the target tensor is missing groundtruth data that
shouldn't be factored into the loss.
......@@ -81,7 +83,8 @@ class Loss(object):
the Loss.
Returns:
loss: a tensor representing the value of the loss function
loss: an N-d tensor of shape [batch, anchors, ...] containing the loss per
anchor.
"""
pass
......@@ -92,15 +95,6 @@ class WeightedL2LocalizationLoss(Loss):
Loss[b,a] = .5 * ||weights[b,a] * (prediction[b,a,:] - target[b,a,:])||^2
"""
def __init__(self, anchorwise_output=False):
"""Constructor.
Args:
anchorwise_output: Outputs loss per anchor. (default False)
"""
self._anchorwise_output = anchorwise_output
def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function.
......@@ -112,15 +106,13 @@ class WeightedL2LocalizationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a (scalar) tensor representing the value of the loss function
or a float tensor of shape [batch_size, num_anchors]
loss: a float tensor of shape [batch_size, num_anchors] representing
the value of the loss function.
"""
weighted_diff = (prediction_tensor - target_tensor) * tf.expand_dims(
weights, 2)
square_diff = 0.5 * tf.square(weighted_diff)
if self._anchorwise_output:
return tf.reduce_sum(square_diff, 2)
return tf.reduce_sum(square_diff)
return tf.reduce_sum(square_diff, 2)
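With anchorwise_output removed, the localization losses always return a per-anchor tensor; a caller that wants the old scalar behavior reduces it itself, as the updated tests further down do. A minimal usage sketch:

import tensorflow as tf
from object_detection.core import losses

loss_op = losses.WeightedL2LocalizationLoss()
predictions = tf.ones([3, 10, 4])   # [batch, anchors, code_size]
targets = tf.zeros([3, 10, 4])
weights = tf.ones([3, 10])
per_anchor = loss_op(predictions, targets, weights=weights)  # [3, 10]
scalar = tf.reduce_sum(per_anchor)  # former anchorwise_output=False result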
class WeightedSmoothL1LocalizationLoss(Loss):
......@@ -132,15 +124,6 @@ class WeightedSmoothL1LocalizationLoss(Loss):
See also Equation (3) in the Fast R-CNN paper by Ross Girshick (ICCV 2015)
"""
def __init__(self, anchorwise_output=False):
"""Constructor.
Args:
anchorwise_output: Outputs loss per anchor. (default False)
"""
self._anchorwise_output = anchorwise_output
def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function.
......@@ -152,7 +135,8 @@ class WeightedSmoothL1LocalizationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a (scalar) tensor representing the value of the loss function
loss: a float tensor of shape [batch_size, num_anchors] representing
the value of the loss function.
"""
diff = prediction_tensor - target_tensor
abs_diff = tf.abs(diff)
......@@ -160,9 +144,7 @@ class WeightedSmoothL1LocalizationLoss(Loss):
anchorwise_smooth_l1norm = tf.reduce_sum(
tf.where(abs_diff_lt_1, 0.5 * tf.square(abs_diff), abs_diff - 0.5),
2) * weights
if self._anchorwise_output:
return anchorwise_smooth_l1norm
return tf.reduce_sum(anchorwise_smooth_l1norm)
return anchorwise_smooth_l1norm
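For reference, the elementwise smooth L1 term used above, sketched in NumPy with illustrative values:

import numpy as np

x = np.array([-2.0, -0.5, 0.0, 0.5, 3.0])  # prediction - target
# 0.5*x^2 for |x| < 1, |x| - 0.5 otherwise:
term = np.where(np.abs(x) < 1, 0.5 * np.square(x), np.abs(x) - 0.5)
print(term)  # [1.5   0.125 0.    0.125 2.5  ]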
class WeightedIOULocalizationLoss(Loss):
......@@ -184,27 +166,19 @@ class WeightedIOULocalizationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a (scalar) tensor representing the value of the loss function
loss: a float tensor of shape [batch_size, num_anchors] representing
the value of the loss function.
"""
predicted_boxes = box_list.BoxList(tf.reshape(prediction_tensor, [-1, 4]))
target_boxes = box_list.BoxList(tf.reshape(target_tensor, [-1, 4]))
per_anchor_iou_loss = 1.0 - box_list_ops.matched_iou(predicted_boxes,
target_boxes)
return tf.reduce_sum(tf.reshape(weights, [-1]) * per_anchor_iou_loss)
return tf.reshape(weights, [-1]) * per_anchor_iou_loss
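A NumPy sketch of the per-anchor IOU loss, 1 - IoU of each predicted box against its matched target; boxes are [ymin, xmin, ymax, xmax] and the values are illustrative:

import numpy as np

def iou(a, b):
  inter_h = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
  inter_w = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
  inter = inter_h * inter_w
  area_a = (a[2] - a[0]) * (a[3] - a[1])
  area_b = (b[2] - b[0]) * (b[3] - b[1])
  return inter / (area_a + area_b - inter)

pred = np.array([0.0, 0.0, 1.0, 1.0])
target = np.array([0.0, 0.0, 1.0, 2.0])
print(1.0 - iou(pred, target))  # 0.5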
class WeightedSigmoidClassificationLoss(Loss):
"""Sigmoid cross entropy classification loss function."""
def __init__(self, anchorwise_output=False):
"""Constructor.
Args:
anchorwise_output: Outputs loss per anchor. (default False)
"""
self._anchorwise_output = anchorwise_output
def _compute_loss(self,
prediction_tensor,
target_tensor,
......@@ -222,8 +196,8 @@ class WeightedSigmoidClassificationLoss(Loss):
If provided, computes loss only for the specified class indices.
Returns:
loss: a (scalar) tensor representing the value of the loss function
or a float tensor of shape [batch_size, num_anchors]
loss: a float tensor of shape [batch_size, num_anchors, num_classes]
representing the value of the loss function.
"""
weights = tf.expand_dims(weights, 2)
if class_indices is not None:
......@@ -233,9 +207,7 @@ class WeightedSigmoidClassificationLoss(Loss):
[1, 1, -1])
per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits(
labels=target_tensor, logits=prediction_tensor))
if self._anchorwise_output:
return tf.reduce_sum(per_entry_cross_ent * weights, 2)
return tf.reduce_sum(per_entry_cross_ent * weights)
return per_entry_cross_ent * weights
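A shape sketch of the new behavior: the sigmoid loss now returns a per-entry [batch_size, num_anchors, num_classes] tensor rather than a scalar:

import tensorflow as tf
from object_detection.core import losses

loss_op = losses.WeightedSigmoidClassificationLoss()
predictions = tf.zeros([2, 5, 3])   # [batch, anchors, classes]
targets = tf.zeros([2, 5, 3])
weights = tf.ones([2, 5])
loss = loss_op(predictions, targets, weights=weights)
print(loss.shape)  # (2, 5, 3)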
class SigmoidFocalClassificationLoss(Loss):
......@@ -245,15 +217,13 @@ class SigmoidFocalClassificationLoss(Loss):
examples. See https://arxiv.org/pdf/1708.02002.pdf for the loss definition.
"""
def __init__(self, anchorwise_output=False, gamma=2.0, alpha=0.25):
def __init__(self, gamma=2.0, alpha=0.25):
"""Constructor.
Args:
anchorwise_output: Outputs loss per anchor. (default False)
gamma: exponent of the modulating factor (1 - p_t) ^ gamma.
alpha: optional alpha weighting factor to balance positives vs negatives.
"""
self._anchorwise_output = anchorwise_output
self._alpha = alpha
self._gamma = gamma
......@@ -274,8 +244,8 @@ class SigmoidFocalClassificationLoss(Loss):
If provided, computes loss only for the specified class indices.
Returns:
loss: a (scalar) tensor representing the value of the loss function
or a float tensor of shape [batch_size, num_anchors]
loss: a float tensor of shape [batch_size, num_anchors, num_classes]
representing the value of the loss function.
"""
weights = tf.expand_dims(weights, 2)
if class_indices is not None:
......@@ -297,25 +267,21 @@ class SigmoidFocalClassificationLoss(Loss):
(1 - target_tensor) * (1 - self._alpha))
focal_cross_entropy_loss = (modulating_factor * alpha_weight_factor *
per_entry_cross_ent)
if self._anchorwise_output:
return tf.reduce_sum(focal_cross_entropy_loss * weights, 2)
return tf.reduce_sum(focal_cross_entropy_loss * weights)
return focal_cross_entropy_loss * weights
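A NumPy sketch of the focal modulation (alpha omitted, illustrative probabilities): examples the model already gets right contribute almost nothing once the cross entropy is scaled by (1 - p_t)**gamma:

import numpy as np

gamma = 2.0
p = np.array([0.95, 0.6, 0.1])   # predicted probability of the true class
ce = -np.log(p)                  # plain cross entropy per example
focal = ((1 - p) ** gamma) * ce  # modulated by (1 - p_t)^gamma
print(np.round(ce, 3))     # [0.051 0.511 2.303]
print(np.round(focal, 3))  # [0.    0.082 1.865]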
class WeightedSoftmaxClassificationLoss(Loss):
"""Softmax loss function."""
def __init__(self, anchorwise_output=False, logit_scale=1.0):
def __init__(self, logit_scale=1.0):
"""Constructor.
Args:
anchorwise_output: Whether to output loss per anchor (default False)
logit_scale: Scale to divide the logits by before the softmax. When this
value is high, the prediction is "diffused" toward uniform; when it is
low, the prediction is made peakier. (default 1.0)
"""
self._anchorwise_output = anchorwise_output
self._logit_scale = logit_scale
def _compute_loss(self, prediction_tensor, target_tensor, weights):
......@@ -329,7 +295,8 @@ class WeightedSoftmaxClassificationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a (scalar) tensor representing the value of the loss function
loss: a float tensor of shape [batch_size, num_anchors]
representing the value of the loss function.
"""
num_classes = prediction_tensor.get_shape().as_list()[-1]
prediction_tensor = tf.divide(
......@@ -337,9 +304,7 @@ class WeightedSoftmaxClassificationLoss(Loss):
per_row_cross_ent = (tf.nn.softmax_cross_entropy_with_logits(
labels=tf.reshape(target_tensor, [-1, num_classes]),
logits=tf.reshape(prediction_tensor, [-1, num_classes])))
if self._anchorwise_output:
return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
return tf.reduce_sum(per_row_cross_ent * tf.reshape(weights, [-1]))
return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights
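An illustrative NumPy sketch of the logit_scale effect exercised in the tests below: dividing logits by a very large scale flattens the softmax toward a uniform distribution:

import numpy as np

def softmax(x):
  e = np.exp(x - x.max())
  return e / e.sum()

logits = np.array([100.0, -100.0, -100.0])
print(softmax(logits / 1.0))   # ~[1, 0, 0]
print(softmax(logits / 1e16))  # ~[0.333, 0.333, 0.333]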
class BootstrappedSigmoidClassificationLoss(Loss):
......@@ -359,14 +324,13 @@ class BootstrappedSigmoidClassificationLoss(Loss):
Reed et al. (ICLR 2015).
"""
def __init__(self, alpha, bootstrap_type='soft', anchorwise_output=False):
def __init__(self, alpha, bootstrap_type='soft'):
"""Constructor.
Args:
alpha: a float32 scalar tensor between 0 and 1 representing interpolation
weight
bootstrap_type: set to either 'hard' or 'soft' (default)
anchorwise_output: Outputs loss per anchor. (default False)
Raises:
ValueError: if bootstrap_type is not either 'hard' or 'soft'
......@@ -376,7 +340,6 @@ class BootstrappedSigmoidClassificationLoss(Loss):
'\'hard\' or \'soft.\'')
self._alpha = alpha
self._bootstrap_type = bootstrap_type
self._anchorwise_output = anchorwise_output
def _compute_loss(self, prediction_tensor, target_tensor, weights):
"""Compute loss function.
......@@ -389,8 +352,8 @@ class BootstrappedSigmoidClassificationLoss(Loss):
weights: a float tensor of shape [batch_size, num_anchors]
Returns:
loss: a (scalar) tensor representing the value of the loss function
or a float tensor of shape [batch_size, num_anchors]
loss: a float tensor of shape [batch_size, num_anchors, num_classes]
representing the value of the loss function.
"""
if self._bootstrap_type == 'soft':
bootstrap_target_tensor = self._alpha * target_tensor + (
......@@ -401,9 +364,7 @@ class BootstrappedSigmoidClassificationLoss(Loss):
tf.sigmoid(prediction_tensor) > 0.5, tf.float32)
per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits(
labels=bootstrap_target_tensor, logits=prediction_tensor))
if self._anchorwise_output:
return tf.reduce_sum(per_entry_cross_ent * tf.expand_dims(weights, 2), 2)
return tf.reduce_sum(per_entry_cross_ent * tf.expand_dims(weights, 2))
return per_entry_cross_ent * tf.expand_dims(weights, 2)
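A NumPy sketch of the two bootstrap target constructions above (illustrative values): 'soft' interpolates toward predicted probabilities, 'hard' toward thresholded predictions:

import numpy as np

alpha = 0.5
target = np.array([1.0, 0.0, 1.0])
pred_prob = np.array([0.9, 0.2, 0.4])  # sigmoid(prediction_tensor)

soft_target = alpha * target + (1 - alpha) * pred_prob
hard_target = alpha * target + (1 - alpha) * (pred_prob > 0.5)
print(soft_target)  # [0.95 0.1  0.7 ]
print(hard_target)  # [1.   0.   0.5 ]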
class HardExampleMiner(object):
......
......@@ -26,7 +26,7 @@ from object_detection.core import matcher
class WeightedL2LocalizationLossTest(tf.test.TestCase):
def testReturnsCorrectLoss(self):
def testReturnsCorrectWeightedLoss(self):
batch_size = 3
num_anchors = 10
code_size = 4
......@@ -36,7 +36,8 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], tf.float32)
loss_op = losses.WeightedL2LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss_op(prediction_tensor, target_tensor,
weights=weights))
expected_loss = (3 * 5 * 4) / 2.0
with self.test_session() as sess:
......@@ -50,7 +51,7 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
prediction_tensor = tf.ones([batch_size, num_anchors, code_size])
target_tensor = tf.zeros([batch_size, num_anchors, code_size])
weights = tf.ones([batch_size, num_anchors])
loss_op = losses.WeightedL2LocalizationLoss(anchorwise_output=True)
loss_op = losses.WeightedL2LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
expected_loss = np.ones((batch_size, num_anchors)) * 2
......@@ -58,22 +59,6 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
loss_output = sess.run(loss)
self.assertAllClose(loss_output, expected_loss)
def testReturnsCorrectLossSum(self):
batch_size = 3
num_anchors = 16
code_size = 4
prediction_tensor = tf.ones([batch_size, num_anchors, code_size])
target_tensor = tf.zeros([batch_size, num_anchors, code_size])
weights = tf.ones([batch_size, num_anchors])
loss_op = losses.WeightedL2LocalizationLoss(anchorwise_output=False)
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
expected_loss = tf.nn.l2_loss(prediction_tensor - target_tensor)
with self.test_session() as sess:
loss_output = sess.run(loss)
expected_loss_output = sess.run(expected_loss)
self.assertAllClose(loss_output, expected_loss_output)
def testReturnsCorrectNanLoss(self):
batch_size = 3
num_anchors = 10
......@@ -87,6 +72,7 @@ class WeightedL2LocalizationLossTest(tf.test.TestCase):
loss_op = losses.WeightedL2LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights,
ignore_nan_targets=True)
loss = tf.reduce_sum(loss)
expected_loss = (3 * 5 * 4) / 2.0
with self.test_session() as sess:
......@@ -111,6 +97,7 @@ class WeightedSmoothL1LocalizationLossTest(tf.test.TestCase):
[0, 3, 0]], tf.float32)
loss_op = losses.WeightedSmoothL1LocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = 7.695
with self.test_session() as sess:
......@@ -130,6 +117,7 @@ class WeightedIOULocalizationLossTest(tf.test.TestCase):
weights = [[1.0, .5, 2.0]]
loss_op = losses.WeightedIOULocalizationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = 2.0
with self.test_session() as sess:
loss_output = sess.run(loss)
......@@ -159,6 +147,7 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSigmoidClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = -2 * math.log(.5)
with self.test_session() as sess:
......@@ -184,8 +173,9 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSigmoidClassificationLoss(True)
loss_op = losses.WeightedSigmoidClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss, axis=2)
exp_loss = np.matrix([[0, 0, -math.log(.5), 0],
[-math.log(.5), 0, 0, 0]])
......@@ -214,9 +204,10 @@ class WeightedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32)
# Ignores the last class.
class_indices = tf.constant([0, 1, 2], tf.int32)
loss_op = losses.WeightedSigmoidClassificationLoss(True)
loss_op = losses.WeightedSigmoidClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights,
class_indices=class_indices)
loss = tf.reduce_sum(loss, axis=2)
exp_loss = np.matrix([[0, 0, -math.log(.5), 0],
[-math.log(.5), 0, 0, 0]])
......@@ -245,14 +236,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0],
[0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=True, gamma=2.0, alpha=None)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
weights=weights)
focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=None)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights), axis=2)
sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
target_tensor,
weights=weights), axis=2)
with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
......@@ -272,14 +262,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0],
[0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=True, gamma=2.0, alpha=None)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
weights=weights)
focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=None)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights), axis=2)
sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
target_tensor,
weights=weights), axis=2)
with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
......@@ -299,14 +288,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0],
[0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=False, gamma=2.0, alpha=None)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=False)
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
weights=weights)
focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=None)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights))
sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
target_tensor,
weights=weights))
with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
......@@ -326,14 +314,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0],
[0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=True, gamma=2.0, alpha=1.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
weights=weights)
focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=1.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights), axis=2)
sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
target_tensor,
weights=weights), axis=2)
with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
......@@ -355,14 +342,13 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[0],
[0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=True, gamma=2.0, alpha=0.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
weights=weights)
focal_loss_op = losses.SigmoidFocalClassificationLoss(gamma=2.0, alpha=0.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights), axis=2)
sigmoid_loss = tf.reduce_sum(sigmoid_loss_op(prediction_tensor,
target_tensor,
weights=weights), axis=2)
with self.test_session() as sess:
sigmoid_loss, focal_loss = sess.run([sigmoid_loss, focal_loss])
......@@ -391,10 +377,8 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 0]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=True, alpha=0.5, gamma=0.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=0.5, gamma=0.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
......@@ -423,10 +407,8 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 0]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=True, alpha=None, gamma=0.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss(
anchorwise_output=True)
focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=None, gamma=0.0)
sigmoid_loss_op = losses.WeightedSigmoidClassificationLoss()
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
sigmoid_loss = sigmoid_loss_op(prediction_tensor, target_tensor,
......@@ -456,11 +438,10 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=False, alpha=1.0, gamma=0.0)
focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=1.0, gamma=0.0)
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights))
with self.test_session() as sess:
focal_loss = sess.run(focal_loss)
self.assertAllClose(
......@@ -489,11 +470,10 @@ class SigmoidFocalClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 1]], tf.float32)
focal_loss_op = losses.SigmoidFocalClassificationLoss(
anchorwise_output=False, alpha=0.75, gamma=0.0)
focal_loss_op = losses.SigmoidFocalClassificationLoss(alpha=0.75, gamma=0.0)
focal_loss = focal_loss_op(prediction_tensor, target_tensor,
weights=weights)
focal_loss = tf.reduce_sum(focal_loss_op(prediction_tensor, target_tensor,
weights=weights))
with self.test_session() as sess:
focal_loss = sess.run(focal_loss)
self.assertAllClose(
......@@ -528,6 +508,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = - 1.5 * math.log(.5)
with self.test_session() as sess:
......@@ -553,7 +534,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, .5, 1],
[1, 1, 1, 0]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationLoss(True)
loss_op = losses.WeightedSoftmaxClassificationLoss()
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
exp_loss = np.matrix([[0, 0, - 0.5 * math.log(.5), 0],
......@@ -564,7 +545,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
def testReturnsCorrectAnchorWiseLossWithHighLogitScaleSetting(self):
"""At very high logit_scale, all predictions will be ~0.33."""
# TODO(yonib): Also test logit_scale with anchorwise=False.
# TODO: Also test logit_scale with anchorwise=False.
logit_scale = 10e16
prediction_tensor = tf.constant([[[-100, 100, -100],
[100, -100, -100],
......@@ -584,8 +565,7 @@ class WeightedSoftmaxClassificationLossTest(tf.test.TestCase):
[1, 0, 0]]], tf.float32)
weights = tf.constant([[1, 1, 1, 1],
[1, 1, 1, 1]], tf.float32)
loss_op = losses.WeightedSoftmaxClassificationLoss(
anchorwise_output=True, logit_scale=logit_scale)
loss_op = losses.WeightedSoftmaxClassificationLoss(logit_scale=logit_scale)
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
uniform_distribution_loss = - math.log(.33333333333)
......@@ -621,6 +601,7 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
loss_op = losses.BootstrappedSigmoidClassificationLoss(
alpha, bootstrap_type='soft')
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = -math.log(.5)
with self.test_session() as sess:
loss_output = sess.run(loss)
......@@ -649,6 +630,7 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
loss_op = losses.BootstrappedSigmoidClassificationLoss(
alpha, bootstrap_type='hard')
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss)
exp_loss = -math.log(.5)
with self.test_session() as sess:
loss_output = sess.run(loss)
......@@ -675,9 +657,9 @@ class BootstrappedSigmoidClassificationLossTest(tf.test.TestCase):
[1, 1, 1, 0]], tf.float32)
alpha = tf.constant(.5, tf.float32)
loss_op = losses.BootstrappedSigmoidClassificationLoss(
alpha, bootstrap_type='hard', anchorwise_output=True)
alpha, bootstrap_type='hard')
loss = loss_op(prediction_tensor, target_tensor, weights=weights)
loss = tf.reduce_sum(loss, axis=2)
exp_loss = np.matrix([[0, 0, -math.log(.5), 0],
[-math.log(.5), 0, 0, 0]])
with self.test_session() as sess:
......
......@@ -36,6 +36,8 @@ from abc import abstractmethod
import tensorflow as tf
from object_detection.utils import ops
class Match(object):
"""Class to store results from the matcher.
......@@ -44,7 +46,7 @@ class Match(object):
convenient methods to query the matching results.
"""
def __init__(self, match_results):
def __init__(self, match_results, use_matmul_gather=False):
"""Constructs a Match object.
Args:
......@@ -52,6 +54,8 @@ class Match(object):
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
use_matmul_gather: Use matrix multiplication based gather instead of
standard tf.gather. (Default: False).
Raises:
ValueError: if match_results does not have rank 1 or is not an
......@@ -63,6 +67,9 @@ class Match(object):
raise ValueError('match_results should be an int32 or int64 scalar '
'tensor')
self._match_results = match_results
self._gather_op = tf.gather
if use_matmul_gather:
self._gather_op = ops.matmul_gather_on_zeroth_axis
@property
def match_results(self):
......@@ -163,17 +170,55 @@ class Match(object):
row_indices: int32 tensor of shape [K] with row indices.
"""
return self._reshape_and_cast(
tf.gather(self._match_results, self.matched_column_indices()))
self._gather_op(self._match_results, self.matched_column_indices()))
def _reshape_and_cast(self, t):
return tf.cast(tf.reshape(t, [-1]), tf.int32)
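The matmul-based gather selected in the constructor can be pictured as a one-hot matrix product; a NumPy sketch of the assumed equivalence with tf.gather on the zeroth axis:

import numpy as np

params = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
indices = np.array([2, 0])
one_hot = np.eye(params.shape[0])[indices]  # shape [2, 3]
print(one_hot @ params)                     # rows 2 and 0 of params
print(params[indices])                      # same result via plain gather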
def gather_based_on_match(self, input_tensor, unmatched_value,
ignored_value):
"""Gathers elements from `input_tensor` based on match results.
For columns that are matched to a row, gathered_tensor[col] is set to
input_tensor[match_results[col]]. For columns that are unmatched,
gathered_tensor[col] is set to unmatched_value. Finally, for columns that
are ignored, gathered_tensor[col] is set to ignored_value.
Note that input_tensor.shape[1:] must match unmatched_value.shape and
ignored_value.shape.
Args:
input_tensor: Tensor to gather values from.
unmatched_value: Constant tensor value for unmatched columns.
ignored_value: Constant tensor value for ignored columns.
Returns:
gathered_tensor: A tensor containing values gathered from input_tensor.
The shape of the gathered tensor is [match_results.shape[0]] +
input_tensor.shape[1:].
"""
input_tensor = tf.concat([tf.stack([ignored_value, unmatched_value]),
input_tensor], axis=0)
gather_indices = tf.maximum(self.match_results + 2, 0)
gathered_tensor = self._gather_op(input_tensor, gather_indices)
return gathered_tensor
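A NumPy sketch of the offset trick in gather_based_on_match: after prepending the ignored and unmatched rows, adding 2 to match_results makes -2 select the ignored value and -1 the unmatched value:

import numpy as np

match_results = np.array([-2, -1, 0, 1])
ignored_value, unmatched_value = -100.0, 0.0
input_tensor = np.array([10.0, 20.0])

padded = np.concatenate([[ignored_value, unmatched_value], input_tensor])
gathered = padded[np.maximum(match_results + 2, 0)]
print(gathered)  # [-100.    0.   10.   20.]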
class Matcher(object):
"""Abstract base class for matcher.
"""
__metaclass__ = ABCMeta
def __init__(self, use_matmul_gather=False):
"""Constructs a Matcher.
Args:
use_matmul_gather: Force constructed match objects to use matrix
multiplication based gather instead of standard tf.gather.
(Default: False).
"""
self._use_matmul_gather = use_matmul_gather
def match(self, similarity_matrix, scope=None, **params):
"""Computes matches among row and column indices and returns the result.
......@@ -191,11 +236,12 @@ class Matcher(object):
A Match object with the results of matching.
"""
with tf.name_scope(scope, 'Match', [similarity_matrix, params]) as scope:
return Match(self._match(similarity_matrix, **params))
return Match(self._match(similarity_matrix, **params),
self._use_matmul_gather)
@abstractmethod
def _match(self, similarity_matrix, **params):
"""Method to be overriden by implementations.
"""Method to be overridden by implementations.
Args:
similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
......