Unverified Commit ed4e22b8 authored by pkulzc's avatar pkulzc Committed by GitHub
Browse files

Merge pull request #3973 from pkulzc/master

Object detection internal changes
parents cac90a0e 13b89b93
......@@ -21,6 +21,7 @@ import tensorflow as tf
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.models import feature_map_generators
from object_detection.utils import context_manager
from object_detection.utils import ops
from object_detection.utils import shape_utils
from nets import resnet_v1
......@@ -36,15 +37,14 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams,
conv_hyperparams_fn,
resnet_base_fn,
resnet_scope_name,
fpn_scope_name,
batch_norm_trainable=True,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
inplace_batchnorm_update=False):
override_base_feature_extractor_hyperparams=False):
"""SSD FPN feature extractor based on Resnet v1 architecture.
Args:
......@@ -54,32 +54,28 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
min_depth: minimum feature extractor depth. UNUSED Currently.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
resnet_base_fn: base resnet network to use.
resnet_scope_name: scope name under which to construct resnet
fpn_scope_name: scope name under which to construct the feature pyramid
network.
batch_norm_trainable: Whether to update batch norm parameters during
training or not. When training with a small batch size
(e.g. 1), it is desirable to disable batch norm update and use
pretrained batch norm params.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
inplace_batchnorm_update: Whether to update batch_norm inplace during
training. This is required for batch norm to work correctly on TPUs.
When this is false, user must add a control dependency on
tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
norm moving average parameters.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
Raises:
ValueError: On supplying invalid arguments for unused arguments.
"""
super(_SSDResnetV1FpnFeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable, reuse_weights,
use_explicit_padding, inplace_batchnorm_update)
conv_hyperparams_fn, reuse_weights, use_explicit_padding,
override_base_feature_extractor_hyperparams)
if self._depth_multiplier != 1.0:
raise ValueError('Only depth 1.0 is supported, found: {}'.
format(self._depth_multiplier))
......@@ -116,7 +112,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
filtered_image_features[feature_name] = feature
return filtered_image_features
def _extract_features(self, preprocessed_inputs):
def extract_features(self, preprocessed_inputs):
"""Extract features from preprocessed inputs.
Args:
......@@ -139,11 +135,14 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
with tf.variable_scope(
self._resnet_scope_name, reuse=self._reuse_weights) as scope:
with slim.arg_scope(resnet_v1.resnet_arg_scope()):
with (slim.arg_scope(self._conv_hyperparams_fn())
if self._override_base_feature_extractor_hyperparams else
context_manager.IdentityContextManager()):
_, image_features = self._resnet_base_fn(
inputs=ops.pad_to_multiple(preprocessed_inputs,
self._pad_to_multiple),
num_classes=None,
is_training=self._is_training and self._batch_norm_trainable,
is_training=None,
global_pool=False,
output_stride=None,
store_non_strided_activations=True,
......@@ -151,7 +150,7 @@ class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
image_features = self._filter_features(image_features)
last_feature_map = image_features['block4']
with tf.variable_scope(self._fpn_scope_name, reuse=self._reuse_weights):
with slim.arg_scope(self._conv_hyperparams):
with slim.arg_scope(self._conv_hyperparams_fn()):
for i in range(5, 7):
last_feature_map = slim.conv2d(
last_feature_map,
......@@ -178,40 +177,36 @@ class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams,
batch_norm_trainable=True,
conv_hyperparams_fn,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
inplace_batchnorm_update=False):
"""Resnet50 v1 FPN Feature Extractor for SSD Models.
override_base_feature_extractor_hyperparams=False):
"""SSD Resnet50 V1 FPN feature extractor based on Resnet v1 architecture.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
UNUSED currently.
min_depth: minimum feature extractor depth. UNUSED Currently.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
batch_norm_trainable: Whether to update batch norm parameters during
training or not. When training with a small batch size
(e.g. 1), it is desirable to disable batch norm update and use
pretrained batch norm params.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
inplace_batchnorm_update: Whether to update batch_norm inplace during
training. This is required for batch norm to work correctly on TPUs.
When this is false, user must add a control dependency on
tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
norm moving average parameters.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
"""
super(SSDResnet50V1FpnFeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, resnet_v1.resnet_v1_50, 'resnet_v1_50', 'fpn',
batch_norm_trainable, reuse_weights, use_explicit_padding,
inplace_batchnorm_update)
conv_hyperparams_fn, resnet_v1.resnet_v1_50, 'resnet_v1_50', 'fpn',
reuse_weights, use_explicit_padding,
override_base_feature_extractor_hyperparams)
class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
......@@ -221,40 +216,36 @@ class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams,
batch_norm_trainable=True,
conv_hyperparams_fn,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
inplace_batchnorm_update=False):
"""Resnet101 v1 FPN Feature Extractor for SSD Models.
override_base_feature_extractor_hyperparams=False):
"""SSD Resnet101 V1 FPN feature extractor based on Resnet v1 architecture.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
UNUSED currently.
min_depth: minimum feature extractor depth. UNUSED Currently.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
batch_norm_trainable: Whether to update batch norm parameters during
training or not. When training with a small batch size
(e.g. 1), it is desirable to disable batch norm update and use
pretrained batch norm params.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
inplace_batchnorm_update: Whether to update batch_norm inplace during
training. This is required for batch norm to work correctly on TPUs.
When this is false, user must add a control dependency on
tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
norm moving average parameters.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
"""
super(SSDResnet101V1FpnFeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, resnet_v1.resnet_v1_101, 'resnet_v1_101', 'fpn',
batch_norm_trainable, reuse_weights, use_explicit_padding,
inplace_batchnorm_update)
conv_hyperparams_fn, resnet_v1.resnet_v1_101, 'resnet_v1_101', 'fpn',
reuse_weights, use_explicit_padding,
override_base_feature_extractor_hyperparams)
class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
......@@ -264,37 +255,33 @@ class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor):
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams,
batch_norm_trainable=True,
conv_hyperparams_fn,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=False,
inplace_batchnorm_update=False):
"""Resnet152 v1 FPN Feature Extractor for SSD Models.
override_base_feature_extractor_hyperparams=False):
"""SSD Resnet152 V1 FPN feature extractor based on Resnet v1 architecture.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
UNUSED currently.
min_depth: minimum feature extractor depth. UNUSED Currently.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: tf slim arg_scope for conv2d and separable_conv2d ops.
batch_norm_trainable: Whether to update batch norm parameters during
training or not. When training with a small batch size
(e.g. 1), it is desirable to disable batch norm update and use
pretrained batch norm params.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False. UNUSED currently.
use_depthwise: Whether to use depthwise convolutions. UNUSED currently.
inplace_batchnorm_update: Whether to update batch_norm inplace during
training. This is required for batch norm to work correctly on TPUs.
When this is false, user must add a control dependency on
tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch
norm moving average parameters.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
"""
super(SSDResnet152V1FpnFeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, resnet_v1.resnet_v1_152, 'resnet_v1_152', 'fpn',
batch_norm_trainable, reuse_weights, use_explicit_padding,
inplace_batchnorm_update)
conv_hyperparams_fn, resnet_v1.resnet_v1_152, 'resnet_v1_152', 'fpn',
reuse_weights, use_explicit_padding,
override_base_feature_extractor_hyperparams)
......@@ -27,13 +27,10 @@ class SSDResnet50V1FeatureExtractorTest(
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
use_explicit_padding=False):
min_depth = 32
conv_hyperparams = {}
batch_norm_trainable = True
is_training = True
return ssd_resnet_v1_fpn_feature_extractor.SSDResnet50V1FpnFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable,
use_explicit_padding=use_explicit_padding)
self.conv_hyperparams_fn, use_explicit_padding=use_explicit_padding)
def _resnet_scope_name(self):
return 'resnet_v1_50'
......@@ -47,13 +44,14 @@ class SSDResnet101V1FeatureExtractorTest(
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
use_explicit_padding=False):
min_depth = 32
conv_hyperparams = {}
batch_norm_trainable = True
is_training = True
return (
ssd_resnet_v1_fpn_feature_extractor.SSDResnet101V1FpnFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
self.conv_hyperparams_fn,
use_explicit_padding=use_explicit_padding))
def _resnet_scope_name(self):
......@@ -68,13 +66,14 @@ class SSDResnet152V1FeatureExtractorTest(
def _create_feature_extractor(self, depth_multiplier, pad_to_multiple,
use_explicit_padding=False):
min_depth = 32
conv_hyperparams = {}
batch_norm_trainable = True
is_training = True
return (
ssd_resnet_v1_fpn_feature_extractor.SSDResnet152V1FpnFeatureExtractor(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, batch_norm_trainable,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
self.conv_hyperparams_fn,
use_explicit_padding=use_explicit_padding))
def _resnet_scope_name(self):
......
......@@ -118,6 +118,7 @@ message MaskRCNNBoxPredictor {
// The number of convolutions applied to image_features in the mask prediction
// branch.
optional int32 mask_prediction_num_conv_layers = 11 [default = 2];
optional bool masks_are_class_agnostic = 12 [default = false];
}
message RfcnBoxPredictor {
......
......@@ -60,6 +60,21 @@ message Ssd {
// Loss configuration for training.
optional Loss loss = 11;
// Whether to update batch norm parameters during training or not.
// When training with a relative small batch size (e.g. 1), it is
// desirable to disable batch norm update and use pretrained batch norm
// params.
//
// Note: Some feature extractors are used with canned arg_scopes
// (e.g resnet arg scopes). In these cases training behavior of batch norm
// variables may depend on both values of `batch_norm_trainable` and
// `is_training`.
//
// When canned arg_scopes are used with feature extractors `conv_hyperparams`
// will apply only to the additional layers that are added and are outside the
// canned arg_scope.
optional bool freeze_batchnorm = 16 [default = false];
// Whether to update batch_norm inplace during training. This is required
// for batch norm to work correctly on TPUs. When this is false, user must add
// a control dependency on tf.GraphKeys.UPDATE_OPS for train/loss op in order
......@@ -69,6 +84,8 @@ message Ssd {
message SsdFeatureExtractor {
reserved 6;
// Type of ssd feature extractor.
optional string type = 1;
......@@ -82,26 +99,19 @@ message SsdFeatureExtractor {
// of the base feature extractor.
optional Hyperparams conv_hyperparams = 4;
// Normally, SSD feature extractors are constructed by reusing an existing
// base feature extractor (that has its own hyperparams) and adding new layers
// on top of it. `conv_hyperparams` above normally applies only to the new
// layers while base feature extractor uses its own default hyperparams. If
// this value is set to true, the base feature extractor's hyperparams will be
// overridden with the `conv_hyperparams`.
optional bool override_base_feature_extractor_hyperparams = 9 [default = false];
// The nearest multiple to zero-pad the input height and width dimensions to.
// For example, if pad_to_multiple = 2, input dimensions are zero-padded
// until the resulting dimensions are even.
optional int32 pad_to_multiple = 5 [default = 1];
// Whether to update batch norm parameters during training or not.
// When training with a relative small batch size (e.g. 1), it is
// desirable to disable batch norm update and use pretrained batch norm
// params.
//
// Note: Some feature extractors are used with canned arg_scopes
// (e.g resnet arg scopes). In these cases training behavior of batch norm
// variables may depend on both values of `batch_norm_trainable` and
// `is_training`.
//
// When canned arg_scopes are used with feature extractors `conv_hyperparams`
// will apply only to the additional layers that are added and are outside the
// canned arg_scope.
optional bool batch_norm_trainable = 6 [default=true];
// Whether to use explicit padding when extracting SSD multiresolution
// features. Note that this does not apply to the base feature extractor.
optional bool use_explicit_padding = 7 [default=false];
......
......@@ -6,8 +6,11 @@ import "object_detection/protos/optimizer.proto";
import "object_detection/protos/preprocessor.proto";
// Message for configuring DetectionModel training jobs (train.py).
// Next id: 25
message TrainConfig {
// Input queue batch size.
// Effective batch size to use for training.
// For TPU (or sync SGD jobs), the batch size per core (or GPU) is going to be
// `batch_size` / number of cores (or `batch_size` / number of GPUs).
optional uint32 batch_size = 1 [default=32];
// Data augmentation options.
......@@ -78,6 +81,10 @@ message TrainConfig {
// Note that only Sigmoid classification losses should be used.
optional bool merge_multiple_label_boxes = 17 [default=false];
// If true, will use multiclass scores from object annotations as ground
// truth. Currently only compatible with annotated image inputs.
optional bool use_multiclass_scores = 24 [default = false];
// Whether to add regularization loss to `total_loss`. This is true by
// default and adds all regularization losses defined in the model to
// `total_loss`.
......
......@@ -98,6 +98,7 @@ model {
epsilon: 0.001,
}
}
override_base_feature_extractor_hyperparams: true
}
loss {
classification_loss {
......
......@@ -98,6 +98,7 @@ model {
epsilon: 0.001,
}
}
override_base_feature_extractor_hyperparams: true
}
loss {
classification_loss {
......
......@@ -98,6 +98,7 @@ model {
epsilon: 0.01,
}
}
override_base_feature_extractor_hyperparams: true
}
loss {
classification_loss {
......
......@@ -69,10 +69,13 @@ def create_input_queue(batch_size_per_clone, create_tensor_dict_fn,
in tensor_dict)
include_keypoints = (fields.InputDataFields.groundtruth_keypoints
in tensor_dict)
include_multiclass_scores = (fields.InputDataFields.multiclass_scores
in tensor_dict)
if data_augmentation_options:
tensor_dict = preprocessor.preprocess(
tensor_dict, data_augmentation_options,
func_arg_map=preprocessor.get_default_func_arg_map(
include_multiclass_scores=include_multiclass_scores,
include_instance_masks=include_instance_masks,
include_keypoints=include_keypoints))
......@@ -85,7 +88,10 @@ def create_input_queue(batch_size_per_clone, create_tensor_dict_fn,
return input_queue
def get_inputs(input_queue, num_classes, merge_multiple_label_boxes=False):
def get_inputs(input_queue,
num_classes,
merge_multiple_label_boxes=False,
use_multiclass_scores=False):
"""Dequeues batch and constructs inputs to object detection model.
Args:
......@@ -95,6 +101,8 @@ def get_inputs(input_queue, num_classes, merge_multiple_label_boxes=False):
or not. Defaults to false. Merged boxes are represented with a single
box and a k-hot encoding of the multiple labels associated with the
boxes.
use_multiclass_scores: Whether to use multiclass scores instead of
groundtruth_classes.
Returns:
images: a list of 3-D float tensor of images.
......@@ -123,9 +131,19 @@ def get_inputs(input_queue, num_classes, merge_multiple_label_boxes=False):
classes_gt = tf.cast(read_data[fields.InputDataFields.groundtruth_classes],
tf.int32)
classes_gt -= label_id_offset
if merge_multiple_label_boxes and use_multiclass_scores:
raise ValueError(
'Using both merge_multiple_label_boxes and use_multiclass_scores is'
'not supported'
)
if merge_multiple_label_boxes:
location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
location_gt, classes_gt, num_classes)
elif use_multiclass_scores:
classes_gt = tf.cast(read_data[fields.InputDataFields.multiclass_scores],
tf.float32)
else:
classes_gt = util_ops.padded_one_hot_encoding(
indices=classes_gt, depth=num_classes, left_pad=0)
......@@ -155,7 +173,8 @@ def _create_losses(input_queue, create_model_fn, train_config):
groundtruth_masks_list, groundtruth_keypoints_list, _) = get_inputs(
input_queue,
detection_model.num_classes,
train_config.merge_multiple_label_boxes)
train_config.merge_multiple_label_boxes,
train_config.use_multiclass_scores)
preprocessed_images = []
true_image_shapes = []
......@@ -183,9 +202,19 @@ def _create_losses(input_queue, create_model_fn, train_config):
tf.losses.add_loss(loss_tensor)
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name,
is_chief, train_dir, graph_hook_fn=None):
def train(create_tensor_dict_fn,
create_model_fn,
train_config,
master,
task,
num_clones,
worker_replicas,
clone_on_cpu,
ps_tasks,
worker_job_name,
is_chief,
train_dir,
graph_hook_fn=None):
"""Training function for detection models.
Args:
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment