Unverified Commit 58856e2b authored by Menglong Zhu, committed by GitHub

Merged commit includes the following changes: (#6726)

246873701  by menglong:

    Missing __init__.py under meta_architectures/

--
246857392  by menglong:

    Standardize proto namespace: lstm_object_detection.protos

--
246625127  by menglong:

    Internal changes.

--
246596481  by menglong:

    Add License

--
246580605  by menglong:

    Internal changes

--
246344626  by menglong:

    Open source interleaved mobilenet v2 model.

--
244893883  by menglong:

    Introduce multi_input_decoder for interleaved model.

--
244461016  by menglong:

    Add pre-bottleneck operation to lstm cells to support interleaved model.

--
244052176  by menglong:

    Update README

--
244020495  by menglong:

    Add test to rnn_decoder.

--
243704250  by menglong:

    Duplicate assignment.

--
243091836  by menglong:

    Move LSTMSSD meta arch into separate folder

--
242900337  by menglong:

    Modified mobilenet definition for LSTM-SSD

--
242773195  by menglong:

    Release GroupedConvLSTMCell implementation: https://arxiv.org/abs/1903.10172

--
242574736  by menglong:

    Introduce module for quantized training.

--
242544306  by menglong:

    lstm_ssd_meta_arch updates, added test
    rename:
    - LSTMMetaArch to LSTMSSDMetaArch
    - LSTMFeatureExtractor to LSTMSSDFeatureExtractor

--
241986236  by menglong:

    Move lstm quantization utils to 3rd party.

--
225922488  by yinxiao:

    Training pipeline fixes.

--
224839137  by yinxiao:

    Issue fix for lstm object detection sample config.

--
224246947  by menglong:

    Fix logging module import

--

PiperOrigin-RevId: 246873701
parent f5073f49
@@ -13,17 +13,21 @@
# limitations under the License.
# ==============================================================================
"""LSTM SSD Meta-architecture definition.
General tensorflow implementation of convolutional Multibox/SSD detection
models with LSTM states, for use on video data. This implementation supports
both the regular LSTM-SSD and the interleaved LSTM-SSD frameworks.
See https://arxiv.org/abs/1711.06368 and https://arxiv.org/abs/1903.10172
for details.
"""
import abc
import re
import tensorflow as tf
from object_detection.core import box_list_ops
from object_detection.core import matcher
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.utils import ops
@@ -32,7 +36,7 @@ from object_detection.utils import shape_utils
slim = tf.contrib.slim
class LSTMSSDMetaArch(ssd_meta_arch.SSDMetaArch):
"""LSTM Meta-architecture definition."""
def __init__(self,
@@ -54,7 +58,7 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
unroll_length,
target_assigner_instance,
add_summaries=True):
super(LSTMSSDMetaArch, self).__init__(
is_training=is_training,
anchor_generator=anchor_generator,
box_predictor=box_predictor,
@@ -94,26 +98,19 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
preprocessed_inputs)
self._batch_size = preprocessed_inputs.shape[0].value / self._unroll_length
self._states = states
anchors = self._anchor_generator.generate(feature_map_spatial_dims,
im_height=image_shape[1],
im_width=image_shape[2])
with tf.variable_scope('MultipleGridAnchorGenerator', reuse=tf.AUTO_REUSE):
self._anchors = box_list_ops.concatenate(anchors)
prediction_dict = self._box_predictor.predict(
feature_maps, self._anchor_generator.num_anchors_per_location())
box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1)
if box_encodings.shape.ndims == 4 and box_encodings.shape[2] == 1:
box_encodings = tf.squeeze(box_encodings, axis=2)
class_predictions_with_background = tf.concat(
prediction_dict['class_predictions_with_background'], axis=1)
predictions_dict = {
'preprocessed_inputs': preprocessed_inputs,
'box_encodings': box_encodings,
@@ -161,10 +158,11 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
if self.groundtruth_has_field(fields.BoxListFields.weights):
weights = self.groundtruth_lists(fields.BoxListFields.weights)
(batch_cls_targets, batch_cls_weights, batch_reg_targets,
batch_reg_weights, batch_match) = self._assign_targets(
self.groundtruth_lists(fields.BoxListFields.boxes),
self.groundtruth_lists(fields.BoxListFields.classes),
keypoints, weights)
match_list = [matcher.Match(match) for match in tf.unstack(batch_match)]
if self._add_summaries:
self._summarize_target_assignment(
self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
@@ -275,8 +273,18 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
return self._feature_extractor.get_base_network_scope()
class LSTMSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""LSTM SSD Meta-architecture Feature Extractor definition."""
__metaclass__ = abc.ABCMeta
@property
def clip_state(self):
return self._clip_state
@clip_state.setter
def clip_state(self, clip_state):
self._clip_state = clip_state
@property
def depth_multipliers(self):
@@ -294,6 +302,18 @@ class LSTMFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
def lstm_state_depth(self, lstm_state_depth):
self._lstm_state_depth = lstm_state_depth
@property
def is_quantized(self):
return self._is_quantized
@is_quantized.setter
def is_quantized(self, is_quantized):
self._is_quantized = is_quantized
@property
def interleaved(self):
return False
@property
def states_and_outputs(self):
"""LSTM states and outputs.
@@ -332,3 +352,81 @@ class LSTMFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
The variable scope of the base network, e.g. MobilenetV1
"""
return self._base_network_scope
@abc.abstractmethod
def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
"""Create the LSTM cell, and initialize state if necessary.
Args:
batch_size: input batch size.
output_size: output size of the lstm cell, [width, height].
state_saver: a state saver object with methods `state` and `save_state`.
state_name: string, the name to use with the state_saver.
Returns:
lstm_cell: the lstm cell unit.
init_state: initial state representations.
step: the step
"""
pass
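# For orientation, a minimal sketch of a concrete override, modeled on the
# GroupedConvLSTMCell implementation later in this change (assumes
# `from lstm_object_detection.lstm import lstm_cells`; the remaining cell
# flags are left at their defaults):
#
#   def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
#     lstm_cell = lstm_cells.GroupedConvLSTMCell(
#         filter_size=(3, 3),
#         output_size=output_size,
#         num_units=max(self._min_depth, self._lstm_state_depth),
#         is_training=self._is_training,
#         activation=tf.nn.relu6)
#     if state_saver is None:
#       # No state saver: start from the cell's own zero state.
#       init_state = lstm_cell.init_state('lstm_state', batch_size, tf.float32)
#       step = None
#     else:
#       # Restore (c, h) and the step counter from the state saver.
#       step = state_saver.state(state_name + '_step')
#       init_state = (state_saver.state(state_name + '_c'),
#                     state_saver.state(state_name + '_h'))
#     return lstm_cell, init_state, step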
class LSTMSSDInterleavedFeatureExtractor(LSTMSSDFeatureExtractor):
"""LSTM SSD Meta-architecture Interleaved Feature Extractor definition."""
__metaclass__ = abc.ABCMeta
@property
def pre_bottleneck(self):
return self._pre_bottleneck
@pre_bottleneck.setter
def pre_bottleneck(self, pre_bottleneck):
self._pre_bottleneck = pre_bottleneck
@property
def low_res(self):
return self._low_res
@low_res.setter
def low_res(self, low_res):
self._low_res = low_res
@property
def interleaved(self):
return True
@property
def interleave_method(self):
return self._interleave_method
@interleave_method.setter
def interleave_method(self, interleave_method):
self._interleave_method = interleave_method
@abc.abstractmethod
def extract_base_features_large(self, preprocessed_inputs):
"""Extract the large base model features.
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
pass
@abc.abstractmethod
def extract_base_features_small(self, preprocessed_inputs):
"""Extract the small base model features.
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
pass
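To make the interleaved contract concrete, a minimal (hypothetical) subclass might stub the two base networks as below; the FakeLSTMInterleavedFeatureExtractor in the tests that follow uses the same pattern, and a real create_lstm_cell appears in the MobilenetV2 extractor later in this change.
import tensorflow as tf
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch

slim = tf.contrib.slim


class ToyInterleavedFeatureExtractor(
    lstm_ssd_meta_arch.LSTMSSDInterleavedFeatureExtractor):
  """Hypothetical extractor with trivial large and small base networks."""

  def extract_base_features_large(self, preprocessed_inputs):
    # Large branch: a single 1x1 conv stands in for the full base network.
    with tf.variable_scope('base_large'):
      net = slim.conv2d(preprocessed_inputs, 32, [1, 1], scope='layer1')
    return net, {'layer1': net}

  def extract_base_features_small(self, preprocessed_inputs):
    # Small branch: same stub; a real extractor would use a cheaper network.
    with tf.variable_scope('base_small'):
      net = slim.conv2d(preprocessed_inputs, 32, [1, 1], scope='layer1')
    return net, {'layer1': net}

  def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
    # Elided here; see LSTMSSDInterleavedMobilenetV2FeatureExtractor below.
    raise NotImplementedError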
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for meta_architectures.lstm_ssd_meta_arch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow as tf
from lstm_object_detection.lstm import lstm_cells
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from object_detection.core import anchor_generator
from object_detection.core import box_list
from object_detection.core import losses
from object_detection.core import post_processing
from object_detection.core import region_similarity_calculator as sim_calc
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.models import feature_map_generators
from object_detection.utils import test_case
from object_detection.utils import test_utils
slim = tf.contrib.slim
MAX_TOTAL_NUM_BOXES = 5
NUM_CLASSES = 1
class FakeLSTMFeatureExtractor(
lstm_ssd_meta_arch.LSTMSSDFeatureExtractor):
def __init__(self):
super(FakeLSTMFeatureExtractor, self).__init__(
is_training=True,
depth_multiplier=1.0,
min_depth=0,
pad_to_multiple=1,
conv_hyperparams_fn=self.scope_fn)
self._lstm_state_depth = 256
def scope_fn(self):
with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu6) as sc:
return sc
def create_lstm_cell(self):
pass
def extract_features(self, preprocessed_inputs, state_saver=None,
state_name='lstm_state', unroll_length=5, scope=None):
with tf.variable_scope('mock_model'):
net = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
kernel_size=1, scope='layer1')
image_features = {'last_layer': net}
self._states_out = {}
feature_map_layout = {
'from_layer': ['last_layer'],
'layer_depth': [-1],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=(self._depth_multiplier),
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
class FakeLSTMInterleavedFeatureExtractor(
lstm_ssd_meta_arch.LSTMSSDInterleavedFeatureExtractor):
def __init__(self):
super(FakeLSTMInterleavedFeatureExtractor, self).__init__(
is_training=True,
depth_multiplier=1.0,
min_depth=0,
pad_to_multiple=1,
conv_hyperparams_fn=self.scope_fn)
self._lstm_state_depth = 256
def scope_fn(self):
with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu6) as sc:
return sc
def create_lstm_cell(self):
pass
def extract_base_features_large(self, preprocessed_inputs):
with tf.variable_scope('base_large'):
net = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
kernel_size=1, scope='layer1')
return net
def extract_base_features_small(self, preprocessed_inputs):
with tf.variable_scope('base_small'):
net = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
kernel_size=1, scope='layer1')
return net
def extract_features(self, preprocessed_inputs, state_saver=None,
state_name='lstm_state', unroll_length=5, scope=None):
with tf.variable_scope('mock_model'):
net_large = self.extract_base_features_large(preprocessed_inputs)
net_small = self.extract_base_features_small(preprocessed_inputs)
net = slim.conv2d(
inputs=tf.concat([net_large, net_small], axis=3),
num_outputs=32,
kernel_size=1,
scope='layer1')
image_features = {'last_layer': net}
self._states_out = {}
feature_map_layout = {
'from_layer': ['last_layer'],
'layer_depth': [-1],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=(self._depth_multiplier),
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator):
"""Sets up a simple 2x2 anchor grid on the unit square."""
def name_scope(self):
return 'MockAnchorGenerator'
def num_anchors_per_location(self):
return [1]
def _generate(self, feature_map_shape_list, im_height, im_width):
return [box_list.BoxList(
tf.constant([[0, 0, .5, .5],
[0, .5, .5, 1],
[.5, 0, 1, .5],
[1., 1., 1.5, 1.5] # Anchor that is outside clip_window.
], tf.float32))]
def num_anchors(self):
return 4
class LSTMSSDMetaArchTest(test_case.TestCase):
def _create_model(self,
interleaved=False,
apply_hard_mining=True,
normalize_loc_loss_by_codesize=False,
add_background_class=True,
random_example_sampling=False,
use_expected_classification_loss_under_sampling=False,
min_num_negative_samples=1,
desired_negative_sampling_ratio=3,
unroll_length=1):
num_classes = NUM_CLASSES
is_training = False
mock_anchor_generator = MockAnchorGenerator2x2()
mock_box_predictor = test_utils.MockBoxPredictor(is_training, num_classes)
mock_box_coder = test_utils.MockBoxCoder()
if interleaved:
fake_feature_extractor = FakeLSTMInterleavedFeatureExtractor()
else:
fake_feature_extractor = FakeLSTMFeatureExtractor()
mock_matcher = test_utils.MockMatcher()
region_similarity_calculator = sim_calc.IouSimilarity()
encode_background_as_zeros = False
def image_resizer_fn(image):
return [tf.identity(image), tf.shape(image)]
classification_loss = losses.WeightedSigmoidClassificationLoss()
localization_loss = losses.WeightedSmoothL1LocalizationLoss()
non_max_suppression_fn = functools.partial(
post_processing.batch_multiclass_non_max_suppression,
score_thresh=-20.0,
iou_thresh=1.0,
max_size_per_class=5,
max_total_size=MAX_TOTAL_NUM_BOXES)
classification_loss_weight = 1.0
localization_loss_weight = 1.0
negative_class_weight = 1.0
normalize_loss_by_num_matches = False
hard_example_miner = None
if apply_hard_mining:
# This hard example miner is expected to be a no-op.
hard_example_miner = losses.HardExampleMiner(
num_hard_examples=None,
iou_threshold=1.0)
target_assigner_instance = target_assigner.TargetAssigner(
region_similarity_calculator,
mock_matcher,
mock_box_coder,
negative_class_weight=negative_class_weight)
code_size = 4
model = lstm_ssd_meta_arch.LSTMSSDMetaArch(
is_training=is_training,
anchor_generator=mock_anchor_generator,
box_predictor=mock_box_predictor,
box_coder=mock_box_coder,
feature_extractor=fake_feature_extractor,
encode_background_as_zeros=encode_background_as_zeros,
image_resizer_fn=image_resizer_fn,
non_max_suppression_fn=non_max_suppression_fn,
score_conversion_fn=tf.identity,
classification_loss=classification_loss,
localization_loss=localization_loss,
classification_loss_weight=classification_loss_weight,
localization_loss_weight=localization_loss_weight,
normalize_loss_by_num_matches=normalize_loss_by_num_matches,
hard_example_miner=hard_example_miner,
unroll_length=unroll_length,
target_assigner_instance=target_assigner_instance,
add_summaries=False)
return model, num_classes, mock_anchor_generator.num_anchors(), code_size
def _get_value_for_matching_key(self, dictionary, suffix):
for key in dictionary.keys():
if key.endswith(suffix):
return dictionary[key]
raise ValueError('key not found {}'.format(suffix))
def test_predict_returns_correct_items_and_sizes(self):
batch_size = 3
height = width = 2
num_unroll = 1
graph = tf.Graph()
with graph.as_default():
model, num_classes, num_anchors, code_size = self._create_model()
preprocessed_images = tf.random_uniform(
[batch_size * num_unroll, height, width, 3],
minval=-1.,
maxval=1.)
true_image_shapes = tf.tile(
[[height, width, 3]], [batch_size, 1])
prediction_dict = model.predict(preprocessed_images, true_image_shapes)
self.assertIn('preprocessed_inputs', prediction_dict)
self.assertIn('box_encodings', prediction_dict)
self.assertIn('class_predictions_with_background', prediction_dict)
self.assertIn('feature_maps', prediction_dict)
self.assertIn('anchors', prediction_dict)
self.assertAllEqual(
[batch_size * num_unroll, height, width, 3],
prediction_dict['preprocessed_inputs'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, code_size],
prediction_dict['box_encodings'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, num_classes + 1],
prediction_dict['class_predictions_with_background'].shape.as_list())
self.assertAllEqual(
[num_anchors, code_size],
prediction_dict['anchors'].shape.as_list())
def test_interleaved_predict_returns_correct_items_and_sizes(self):
batch_size = 3
height = width = 2
num_unroll = 1
graph = tf.Graph()
with graph.as_default():
model, num_classes, num_anchors, code_size = self._create_model(
interleaved=True)
preprocessed_images = tf.random_uniform(
[batch_size * num_unroll, height, width, 3],
minval=-1.,
maxval=1.)
true_image_shapes = tf.tile(
[[height, width, 3]], [batch_size, 1])
prediction_dict = model.predict(preprocessed_images, true_image_shapes)
self.assertIn('preprocessed_inputs', prediction_dict)
self.assertIn('box_encodings', prediction_dict)
self.assertIn('class_predictions_with_background', prediction_dict)
self.assertIn('feature_maps', prediction_dict)
self.assertIn('anchors', prediction_dict)
self.assertAllEqual(
[batch_size * num_unroll, height, width, 3],
prediction_dict['preprocessed_inputs'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, code_size],
prediction_dict['box_encodings'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, num_classes + 1],
prediction_dict['class_predictions_with_background'].shape.as_list())
self.assertAllEqual(
[num_anchors, code_size],
prediction_dict['anchors'].shape.as_list())
if __name__ == '__main__':
tf.test.main()
@@ -14,8 +14,9 @@
# ==============================================================================
"""A function to build a DetectionModel from configuration."""
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from lstm_object_detection.models import lstm_ssd_interleaved_mobilenet_v2_feature_extractor
from lstm_object_detection.models import lstm_ssd_mobilenet_v1_feature_extractor
from object_detection.builders import anchor_generator_builder
from object_detection.builders import box_coder_builder
from object_detection.builders import box_predictor_builder
@@ -29,7 +30,12 @@ from object_detection.builders import region_similarity_calculator_builder as si
from object_detection.core import target_assigner
model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP.update({
'lstm_ssd_mobilenet_v1':
lstm_ssd_mobilenet_v1_feature_extractor
.LSTMSSDMobileNetV1FeatureExtractor,
'lstm_ssd_interleaved_mobilenet_v2':
lstm_ssd_interleaved_mobilenet_v2_feature_extractor
.LSTMSSDInterleavedMobilenetV2FeatureExtractor,
})
SSD_FEATURE_EXTRACTOR_CLASS_MAP = model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP
@@ -54,14 +60,14 @@ def build(model_config, lstm_config, is_training):
def _build_lstm_feature_extractor(feature_extractor_config,
is_training,
lstm_config,
reuse_weights=None):
"""Builds a ssd_meta_arch.SSDFeatureExtractor based on config.
Args:
feature_extractor_config: A SSDFeatureExtractor proto config from ssd.proto.
is_training: True if this feature extractor is being built for training.
lstm_config: LSTM-SSD specific configs.
reuse_weights: If the feature extractor should reuse weights.
Returns:
@@ -86,10 +92,27 @@ def _build_lstm_feature_extractor(feature_extractor_config,
raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type))
feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
feature_extractor = feature_extractor_class(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams)
# Extra configs for LSTM-SSD.
feature_extractor.lstm_state_depth = lstm_config.lstm_state_depth
feature_extractor.flatten_state = lstm_config.flatten_state
feature_extractor.clip_state = lstm_config.clip_state
feature_extractor.scale_state = lstm_config.scale_state
feature_extractor.is_quantized = lstm_config.is_quantized
feature_extractor.low_res = lstm_config.low_res
# Extra configs for interleaved LSTM-SSD.
if 'interleaved' in feature_extractor_config.type:
feature_extractor.pre_bottleneck = lstm_config.pre_bottleneck
feature_extractor.depth_multipliers = lstm_config.depth_multipliers
if is_training:
feature_extractor.interleave_method = lstm_config.train_interleave_method
else:
feature_extractor.interleave_method = lstm_config.eval_interleave_method
return feature_extractor
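# For reference, a (hypothetical) lstm_model config exercising the interleaved
# fields wired above; it mirrors the interleaved test config later in this
# change, and any fields not set here fall back to their proto defaults:
#
#   [lstm_object_detection.protos.lstm_model] {
#     lstm_state_depth: 320
#     depth_multipliers: 1.4
#     depth_multipliers: 0.35
#     pre_bottleneck: true
#     low_res: true
#     train_interleave_method: 'RANDOM_SKIP_SMALL'
#     eval_interleave_method: 'SKIP3'
#   }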
def _build_lstm_model(ssd_config, lstm_config, is_training):
@@ -97,19 +120,19 @@ def _build_lstm_model(ssd_config, lstm_config, is_training):
Args:
ssd_config: A ssd.proto object containing the config for the desired
LSTMSSDMetaArch.
lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
is_training: True if this model is being built for training purposes.
Returns:
LSTMSSDMetaArch based on the config.
Raises:
ValueError: If ssd_config.type is not recognized (i.e. not registered in
model_class_map), or if lstm_config.interleave_strategy is not recognized.
ValueError: If unroll_length is not specified in the config file.
"""
feature_extractor = _build_lstm_feature_extractor(
ssd_config.feature_extractor, is_training, lstm_config)
box_coder = box_coder_builder.build(ssd_config.box_coder)
matcher = matcher_builder.build(ssd_config.matcher)
@@ -147,7 +170,7 @@ def _build_lstm_model(ssd_config, lstm_config, is_training):
box_coder,
negative_class_weight=negative_class_weight)
lstm_model = lstm_ssd_meta_arch.LSTMSSDMetaArch(
is_training=is_training,
anchor_generator=anchor_generator,
box_predictor=ssd_box_predictor,
...
@@ -13,19 +13,19 @@
# limitations under the License.
# ==============================================================================
"""Tests for lstm_object_detection.tensorflow.model_builder."""
import tensorflow as tf
from google.protobuf import text_format
from lstm_object_detection import model_builder
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from lstm_object_detection.protos import pipeline_pb2 as internal_pipeline_pb2
from object_detection.protos import pipeline_pb2
class ModelBuilderTest(tf.test.TestCase):
def create_train_model(self, model_config, lstm_config):
"""Builds a DetectionModel based on the model config.
Args:
@@ -39,6 +39,20 @@ class ModelBuilderTest(tf.test.TestCase):
"""
return model_builder.build(model_config, lstm_config, is_training=True)
def create_eval_model(self, model_config, lstm_config):
"""Builds a DetectionModel based on the model config.
Args:
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LstmModel config proto that specifies LSTM train/eval
configs.
Returns:
DetectionModel based on the config.
"""
return model_builder.build(model_config, lstm_config, is_training=False)
def get_model_configs_from_proto(self):
"""Creates a model text proto for testing.
@@ -47,14 +61,110 @@ class ModelBuilderTest(tf.test.TestCase):
"""
model_text_proto = """
[lstm_object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 4
}
model {
ssd {
feature_extractor {
type: 'lstm_ssd_mobilenet_v1'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
negative_class_weight: 2.0
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
aspect_ratios: 1.0
}
}
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
}
normalize_loc_loss_by_codesize: true
loss {
classification_loss {
weighted_softmax {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
}
}
}"""
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
text_format.Merge(model_text_proto, pipeline_config)
configs = {}
configs['model'] = pipeline_config.model
configs['lstm_model'] = pipeline_config.Extensions[
internal_pipeline_pb2.lstm_model]
return configs
def get_interleaved_model_configs_from_proto(self):
"""Creates an interleaved model text proto for testing.
Returns:
A dictionary of model configs.
"""
model_text_proto = """
[lstm_object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 10
lstm_state_depth: 320
depth_multipliers: 1.4
depth_multipliers: 0.35
pre_bottleneck: true
low_res: true
train_interleave_method: 'RANDOM_SKIP_SMALL'
eval_interleave_method: 'SKIP3'
}
model {
ssd {
feature_extractor {
type: 'lstm_ssd_interleaved_mobilenet_v2'
conv_hyperparams {
regularizer {
l2_regularizer {
@@ -134,24 +244,58 @@ class ModelBuilderTest(tf.test.TestCase):
self.assertEqual(configs['model'].ssd.negative_class_weight, 2.0)
self.assertTrue(configs['model'].ssd.normalize_loc_loss_by_codesize)
self.assertEqual(configs['model'].ssd.feature_extractor.type,
'lstm_ssd_mobilenet_v1')
model = self.create_train_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM unroll length.
self.assertEqual(model.unroll_length, 4)
model = self.create_eval_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM configs.
self.assertEqual(model.unroll_length, 4)
def test_interleaved_model_creation_from_valid_configs(self):
configs = self.get_interleaved_model_configs_from_proto()
# Test model properties.
self.assertEqual(configs['model'].ssd.negative_class_weight, 2.0)
self.assertTrue(configs['model'].ssd.normalize_loc_loss_by_codesize)
self.assertEqual(configs['model'].ssd.feature_extractor.type,
'lstm_ssd_interleaved_mobilenet_v2')
model = self.create_train_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM configs.
self.assertEqual(model.unroll_length, 4)
self.assertEqual(model._feature_extractor.lstm_state_depth, 320)
self.assertAllClose(model._feature_extractor.depth_multipliers, (1.4, 0.35))
self.assertTrue(model._feature_extractor.pre_bottleneck)
self.assertTrue(model._feature_extractor.low_res)
self.assertEqual(model._feature_extractor.interleave_method,
'RANDOM_SKIP_SMALL')
model = self.create_eval_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM configs.
self.assertEqual(model.unroll_length, 10)
self.assertEqual(model._feature_extractor.lstm_state_depth, 320)
self.assertAllClose(model._feature_extractor.depth_multipliers, (1.4, 0.35))
self.assertTrue(model._feature_extractor.pre_bottleneck)
self.assertTrue(model._feature_extractor.low_res)
self.assertEqual(model._feature_extractor.interleave_method, 'SKIP3')
def test_model_creation_from_invalid_configs(self):
configs = self.get_model_configs_from_proto()
# Test model build failure with wrong input configs.
with self.assertRaises(AttributeError):
_ = self.create_train_model(configs['model'], configs['model'])
with self.assertRaises(AttributeError):
_ = self.create_eval_model(configs['model'], configs['model'])
if __name__ == '__main__':
...
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTDInterleavedFeatureExtractor which interleaves multiple MobileNet V2."""
import tensorflow as tf
from tensorflow.python.framework import ops as tf_ops
from lstm_object_detection.lstm import lstm_cells
from lstm_object_detection.lstm import rnn_decoder
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from lstm_object_detection.models import mobilenet_defs
from object_detection.models import feature_map_generators
from object_detection.utils import ops
from object_detection.utils import shape_utils
from nets.mobilenet import mobilenet
from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim
class LSTMSSDInterleavedMobilenetV2FeatureExtractor(
lstm_ssd_meta_arch.LSTMSSDInterleavedFeatureExtractor):
"""LSTM-SSD Interleaved Feature Extractor using MobilenetV2 features."""
def __init__(self,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams_fn,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=True,
override_base_feature_extractor_hyperparams=False):
"""Interleaved Feature Extractor for LSTD Models with MobileNet v2.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False.
use_depthwise: Whether to use depthwise convolutions. Default is True.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
"""
super(LSTMSSDInterleavedMobilenetV2FeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams)
# RANDOM_SKIP_SMALL means the training policy is random and the small model
# does not update state during training.
if self._is_training:
self._interleave_method = 'RANDOM_SKIP_SMALL'
else:
self._interleave_method = 'SKIP9'
self._flatten_state = False
self._scale_state = False
self._clip_state = True
self._pre_bottleneck = True
self._feature_map_layout = {
'from_layer': ['layer_19', '', '', '', ''],
'layer_depth': [-1, 256, 256, 256, 256],
'use_depthwise': self._use_depthwise,
'use_explicit_padding': self._use_explicit_padding,
}
self._low_res = True
self._base_network_scope = 'MobilenetV2'
def extract_base_features_large(self, preprocessed_inputs):
"""Extract the large base model features.
Variables are created under the scope of <scope>/MobilenetV2_1/
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
scope_name = self._base_network_scope + '_1'
with tf.variable_scope(scope_name, reuse=self._reuse_weights) as base_scope:
net, end_points = mobilenet_v2.mobilenet_base(
preprocessed_inputs,
depth_multiplier=self._depth_multipliers[0],
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(
is_quantized=self._is_quantized),
use_explicit_padding=self._use_explicit_padding,
scope=base_scope)
return net, end_points
def extract_base_features_small(self, preprocessed_inputs):
"""Extract the small base model features.
Variables are created under the scope of <scope>/MobilenetV2_2/
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
scope_name = self._base_network_scope + '_2'
with tf.variable_scope(scope_name, reuse=self._reuse_weights) as base_scope:
if self._low_res:
size_small = preprocessed_inputs.get_shape().as_list()[1] / 2
inputs_small = tf.image.resize_images(preprocessed_inputs,
[size_small, size_small])
# Create end point handle for tflite deployment.
with tf.name_scope(None):
inputs_small = tf.identity(
inputs_small, name='normalized_input_image_tensor_small')
else:
inputs_small = preprocessed_inputs
net, end_points = mobilenet_v2.mobilenet_base(
inputs_small,
depth_multiplier=self._depth_multipliers[1],
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(
is_quantized=self._is_quantized, low_res=self._low_res),
use_explicit_padding=self._use_explicit_padding,
scope=base_scope)
return net, end_points
def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
"""Create the LSTM cell, and initialize state if necessary.
Args:
batch_size: input batch size.
output_size: output size of the lstm cell, [width, height].
state_saver: a state saver object with methods `state` and `save_state`.
state_name: string, the name to use with the state_saver.
Returns:
lstm_cell: the lstm cell unit.
init_state: initial state representations.
step: the step
"""
lstm_cell = lstm_cells.GroupedConvLSTMCell(
filter_size=(3, 3),
output_size=output_size,
num_units=max(self._min_depth, self._lstm_state_depth),
is_training=self._is_training,
activation=tf.nn.relu6,
flatten_state=self._flatten_state,
scale_state=self._scale_state,
clip_state=self._clip_state,
output_bottleneck=True,
pre_bottleneck=self._pre_bottleneck,
is_quantized=self._is_quantized,
visualize_gates=False)
if state_saver is None:
init_state = lstm_cell.init_state('lstm_state', batch_size, tf.float32)
step = None
else:
step = state_saver.state(state_name + '_step')
c = state_saver.state(state_name + '_c')
h = state_saver.state(state_name + '_h')
c.set_shape([batch_size] + c.get_shape().as_list()[1:])
h.set_shape([batch_size] + h.get_shape().as_list()[1:])
init_state = (c, h)
return lstm_cell, init_state, step
def extract_features(self, preprocessed_inputs, state_saver=None,
state_name='lstm_state', unroll_length=10, scope=None):
"""Extract features from preprocessed inputs.
The features include the base network features, lstm features and SSD
features, organized in the following name scope:
<scope>/MobilenetV2_1/...
<scope>/MobilenetV2_2/...
<scope>/LSTM/...
<scope>/FeatureMap/...
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of consecutive frames from video clips.
state_saver: A state saver object with methods `state` and `save_state`.
state_name: Python string, the name to use with the state_saver.
unroll_length: number of steps to unroll the lstm.
scope: Scope for the base network of the feature extractor.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
Raises:
ValueError: if interleave_method not recognized or large and small base
network output feature maps of different sizes.
"""
preprocessed_inputs = shape_utils.check_min_image_dim(
33, preprocessed_inputs)
preprocessed_inputs = ops.pad_to_multiple(
preprocessed_inputs, self._pad_to_multiple)
batch_size = preprocessed_inputs.shape[0].value / unroll_length
batch_axis = 0
nets = []
# Batch processing of mobilenet features.
with slim.arg_scope(mobilenet_v2.training_scope(
is_training=self._is_training,
bn_decay=0.9997)), \
slim.arg_scope([mobilenet.depth_multiplier],
min_depth=self._min_depth, divisible_by=8):
# Big model.
net, _ = self.extract_base_features_large(preprocessed_inputs)
nets.append(net)
large_base_feature_shape = net.shape
# Small models
net, _ = self.extract_base_features_small(preprocessed_inputs)
nets.append(net)
small_base_feature_shape = net.shape
if not (large_base_feature_shape[1] == small_base_feature_shape[1] and
large_base_feature_shape[2] == small_base_feature_shape[2]):
raise ValueError('Large and Small base network feature map dimension '
'not equal!')
with slim.arg_scope(self._conv_hyperparams_fn()):
with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
output_size = (large_base_feature_shape[1], large_base_feature_shape[2])
lstm_cell, init_state, step = self.create_lstm_cell(
batch_size, output_size, state_saver, state_name)
nets_seq = [
tf.split(net, unroll_length, axis=batch_axis) for net in nets
]
net_seq, states_out = rnn_decoder.multi_input_rnn_decoder(
nets_seq,
init_state,
lstm_cell,
step,
selection_strategy=self._interleave_method,
is_training=self._is_training,
pre_bottleneck=self._pre_bottleneck,
flatten_state=self._flatten_state,
scope=lstm_scope)
self._states_out = states_out
batcher_ops = None
if state_saver is not None:
self._step = state_saver.state(state_name + '_step')
batcher_ops = [
state_saver.save_state(state_name + '_c', states_out[-1][0]),
state_saver.save_state(state_name + '_h', states_out[-1][1]),
state_saver.save_state(state_name + '_step', self._step + 1)]
image_features = {}
with tf_ops.control_dependencies(batcher_ops):
image_features['layer_19'] = tf.concat(net_seq, 0)
# SSD layers.
with tf.variable_scope('FeatureMap'):
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=self._feature_map_layout,
depth_multiplier=self._depth_multiplier,
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features,
pool_residual=True)
return feature_maps.values()
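As a rough usage sketch (assembled from the feature extractor tests that follow; the constructor arguments, image size, and depth multipliers are illustrative, not prescribed):
import tensorflow as tf
from lstm_object_detection.models import lstm_ssd_interleaved_mobilenet_v2_feature_extractor

slim = tf.contrib.slim


def conv_hyperparams_fn():
  # Minimal slim hyperparams scope, as in the tests below.
  with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm), \
      slim.arg_scope([slim.batch_norm], is_training=False) as sc:
    return sc

extractor = (lstm_ssd_interleaved_mobilenet_v2_feature_extractor
             .LSTMSSDInterleavedMobilenetV2FeatureExtractor(
                 False, 1.0, 32, 1, conv_hyperparams_fn))
extractor.lstm_state_depth = 320
extractor.depth_multipliers = [1.0, 0.25]
extractor.is_quantized = False
images = tf.placeholder(tf.float32, [1, 320, 320, 3])
# preprocess() maps pixels into [-1, 1]; extract_features() runs both base
# networks, the interleaved LSTM, and the SSD feature map layers.
feature_maps = extractor.extract_features(
    extractor.preprocess(images), unroll_length=1)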
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lstm_ssd_interleaved_mobilenet_v2_feature_extractor."""
import itertools
import numpy as np
import tensorflow as tf
from lstm_object_detection.models import lstm_ssd_interleaved_mobilenet_v2_feature_extractor
from object_detection.models import ssd_feature_extractor_test
slim = tf.contrib.slim
class LSTMSSDInterleavedMobilenetV2FeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
def _create_feature_extractor(self,
depth_multiplier,
pad_to_multiple,
is_quantized=False):
"""Constructs a new feature extractor.
Args:
depth_multiplier: float depth multiplier for feature extractor
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
is_quantized: whether to quantize the graph.
Returns:
an ssd_meta_arch.SSDFeatureExtractor object.
"""
min_depth = 32
def conv_hyperparams_fn():
with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm), \
slim.arg_scope([slim.batch_norm], is_training=False) as sc:
return sc
feature_extractor = (
lstm_ssd_interleaved_mobilenet_v2_feature_extractor
.LSTMSSDInterleavedMobilenetV2FeatureExtractor(False, depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams_fn))
feature_extractor.lstm_state_depth = int(320 * depth_multiplier)
feature_extractor.depth_multipliers = [
depth_multiplier, depth_multiplier / 4.0
]
feature_extractor.is_quantized = is_quantized
return feature_extractor
def test_extract_features_returns_correct_shapes_128(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 4, 4, 640),
(2, 2, 2, 256), (2, 1, 1, 256),
(2, 1, 1, 256), (2, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_unroll10(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(10, 4, 4, 640),
(10, 2, 2, 256), (10, 1, 1, 256),
(10, 1, 1, 256), (10, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
10, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape, unroll_length=10)
def test_extract_features_returns_correct_shapes_320(self):
image_height = 320
image_width = 320
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 10, 10, 640),
(2, 5, 5, 256), (2, 3, 3, 256),
(2, 2, 2, 256), (2, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
image_height = 320
image_width = 320
depth_multiplier = 0.5**12
pad_to_multiple = 1
expected_feature_map_shape = [(2, 10, 10, 64),
(2, 5, 5, 32), (2, 3, 3, 32),
(2, 2, 2, 32), (2, 1, 1, 32)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
image_height = 299
image_width = 299
depth_multiplier = 1.0
pad_to_multiple = 32
expected_feature_map_shape = [(2, 10, 10, 640),
(2, 5, 5, 256), (2, 3, 3, 256),
(2, 2, 2, 256), (2, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_preprocess_returns_correct_value_range(self):
image_height = 128
image_width = 128
depth_multiplier = 1
pad_to_multiple = 1
test_image = np.random.rand(4, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
def test_variables_only_created_in_scope(self):
depth_multiplier = 1
pad_to_multiple = 1
scope_names = ['MobilenetV2', 'LSTM', 'FeatureMap']
self.check_feature_extractor_variables_under_scopes(
depth_multiplier, pad_to_multiple, scope_names)
def test_has_fused_batchnorm(self):
image_height = 40
image_width = 40
depth_multiplier = 1
pad_to_multiple = 32
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
self.assertTrue(any(op.type == 'FusedBatchNorm'
for op in tf.get_default_graph().get_operations()))
def test_variables_for_tflite(self):
image_height = 40
image_width = 40
depth_multiplier = 1
pad_to_multiple = 32
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
tflite_unsupported = ['SquaredDifference']
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
self.assertFalse(any(op.type in tflite_unsupported
for op in tf.get_default_graph().get_operations()))
def test_output_nodes_for_tflite(self):
image_height = 64
image_width = 64
depth_multiplier = 1.0
pad_to_multiple = 1
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
tflite_nodes = [
'raw_inputs/init_lstm_c',
'raw_inputs/init_lstm_h',
'raw_inputs/base_endpoint',
'raw_outputs/lstm_c',
'raw_outputs/lstm_h',
'raw_outputs/base_endpoint_1',
'raw_outputs/base_endpoint_2'
]
ops_names = [op.name for op in tf.get_default_graph().get_operations()]
for node in tflite_nodes:
self.assertTrue(any(node in s for s in ops_names))
def test_fixed_concat_nodes(self):
image_height = 64
image_width = 64
depth_multiplier = 1.0
pad_to_multiple = 1
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(
depth_multiplier, pad_to_multiple, is_quantized=True)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
concat_nodes = [
'MobilenetV2_1/expanded_conv_16/project/Relu6',
'MobilenetV2_2/expanded_conv_16/project/Relu6'
]
ops_names = [op.name for op in tf.get_default_graph().get_operations()]
for node in concat_nodes:
self.assertTrue(any(node in s for s in ops_names))
def test_lstm_states(self):
image_height = 256
image_width = 256
depth_multiplier = 1
pad_to_multiple = 1
state_channel = 320
init_state1 = {
'lstm_state_c': tf.zeros(
[image_height/32, image_width/32, state_channel]),
'lstm_state_h': tf.zeros(
[image_height/32, image_width/32, state_channel]),
'lstm_state_step': tf.zeros([1])
}
init_state2 = {
'lstm_state_c': tf.random_uniform(
[image_height/32, image_width/32, state_channel]),
'lstm_state_h': tf.random_uniform(
[image_height/32, image_width/32, state_channel]),
'lstm_state_step': tf.zeros([1])
}
seq = {'dummy': tf.random_uniform([2, 1, 1, 1])}
stateful_reader1 = tf.contrib.training.SequenceQueueingStateSaver(
batch_size=1, num_unroll=1, input_length=2, input_key='',
input_sequences=seq, input_context={}, initial_states=init_state1,
capacity=1)
stateful_reader2 = tf.contrib.training.SequenceQueueingStateSaver(
batch_size=1, num_unroll=1, input_length=2, input_key='',
input_sequences=seq, input_context={}, initial_states=init_state2,
capacity=1)
image = tf.random_uniform([1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
with tf.variable_scope('zero_state'):
feature_maps1 = feature_extractor.extract_features(
image, stateful_reader1.next_batch, unroll_length=1)
with tf.variable_scope('random_state'):
feature_maps2 = feature_extractor.extract_features(
image, stateful_reader2.next_batch, unroll_length=1)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
sess.run([stateful_reader1.prefetch_op, stateful_reader2.prefetch_op])
maps1, maps2 = sess.run([feature_maps1, feature_maps2])
state = sess.run(stateful_reader1.next_batch.state('lstm_state_c'))
# feature maps should be different because states are different
self.assertFalse(np.all(np.equal(maps1[0], maps2[0])))
# state should no longer be zero after update
self.assertTrue(state.any())
def check_extract_features_returns_correct_shape(
self, batch_size, image_height, image_width, depth_multiplier,
pad_to_multiple, expected_feature_map_shapes, unroll_length=1):
def graph_fn(image_tensor):
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
feature_maps = feature_extractor.extract_features(
image_tensor, unroll_length=unroll_length)
return feature_maps
image_tensor = np.random.rand(batch_size, image_height, image_width,
3).astype(np.float32)
feature_maps = self.execute(graph_fn, [image_tensor])
for feature_map, expected_shape in itertools.izip(
feature_maps, expected_feature_map_shapes):
self.assertAllEqual(feature_map.shape, expected_shape)
def check_feature_extractor_variables_under_scopes(
self, depth_multiplier, pad_to_multiple, scope_names):
g = tf.Graph()
with g.as_default():
feature_extractor = self._create_feature_extractor(
depth_multiplier, pad_to_multiple)
preprocessed_inputs = tf.placeholder(tf.float32, (4, 320, 320, 3))
feature_extractor.extract_features(
preprocessed_inputs, unroll_length=1)
variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
for variable in variables:
self.assertTrue(
any([
variable.name.startswith(scope_name)
for scope_name in scope_names
]), 'Variable name: ' + variable.name +
' is not under any provided scopes: ' + ','.join(scope_names))
if __name__ == '__main__':
tf.test.main()
...@@ -13,13 +13,13 @@
 # limitations under the License.
 # ==============================================================================
-"""LSTMFeatureExtractor for MobilenetV1 features."""
+"""LSTMSSDFeatureExtractor for MobilenetV1 features."""

 import tensorflow as tf
 from tensorflow.python.framework import ops as tf_ops
 from lstm_object_detection.lstm import lstm_cells
-from lstm_object_detection.lstm import lstm_meta_arch
 from lstm_object_detection.lstm import rnn_decoder
+from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
 from object_detection.models import feature_map_generators
 from object_detection.utils import context_manager
 from object_detection.utils import ops
...@@ -29,7 +29,8 @@ from nets import mobilenet_v1
 slim = tf.contrib.slim


-class LSTMMobileNetV1FeatureExtractor(lstm_meta_arch.LSTMFeatureExtractor):
+class LSTMSSDMobileNetV1FeatureExtractor(
+    lstm_ssd_meta_arch.LSTMSSDFeatureExtractor):
   """LSTM Feature Extractor using MobilenetV1 features."""

   def __init__(self,
...@@ -37,13 +38,13 @@ class LSTMSSDMobileNetV1FeatureExtractor(
                depth_multiplier,
                min_depth,
                pad_to_multiple,
-               conv_hyperparams,
+               conv_hyperparams_fn,
                reuse_weights=None,
                use_explicit_padding=False,
                use_depthwise=True,
                override_base_feature_extractor_hyperparams=False,
                lstm_state_depth=256):
-    """Initializes instance of MobileNetV1 Feature Extractor for LSTM Models.
+    """Initializes instance of MobileNetV1 Feature Extractor for LSTMSSD Models.
     Args:
       is_training: A boolean whether the network is in training mode.
...@@ -51,7 +52,7 @@ class LSTMSSDMobileNetV1FeatureExtractor(
       min_depth: A number representing minimum feature extractor depth.
       pad_to_multiple: The nearest multiple to zero pad the input height and
         width dimensions to.
-      conv_hyperparams: A function to construct tf slim arg_scope for conv2d
+      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
         and separable_conv2d ops in the layers that are added on top of the
         base feature extractor.
       reuse_weights: Whether to reuse variables. Default is None.
...@@ -63,9 +64,9 @@ class LSTMSSDMobileNetV1FeatureExtractor(
         `conv_hyperparams_fn`.
       lstm_state_depth: An integer of the depth of the lstm state.
     """
-    super(LSTMMobileNetV1FeatureExtractor, self).__init__(
+    super(LSTMSSDMobileNetV1FeatureExtractor, self).__init__(
         is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
+        conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise,
         override_base_feature_extractor_hyperparams)
     self._feature_map_layout = {
         'from_layer': ['Conv2d_13_pointwise_lstm', '', '', '', ''],
...@@ -76,6 +77,37 @@ class LSTMSSDMobileNetV1FeatureExtractor(
     self._base_network_scope = 'MobilenetV1'
     self._lstm_state_depth = lstm_state_depth

  def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
    """Create the LSTM cell, and initialize state if necessary.

    Args:
      batch_size: input batch size.
      output_size: output size of the lstm cell, [width, height].
      state_saver: a state saver object with methods `state` and `save_state`.
      state_name: string, the name to use with the state_saver.

    Returns:
      lstm_cell: the lstm cell unit.
      init_state: initial state representations.
      step: the current step tensor from the state saver, or None if no state
        saver is provided.
    """
    lstm_cell = lstm_cells.BottleneckConvLSTMCell(
        filter_size=(3, 3),
        output_size=output_size,
        num_units=max(self._min_depth, self._lstm_state_depth),
        activation=tf.nn.relu6,
        visualize_gates=False)

    if state_saver is None:
      init_state = lstm_cell.init_state(state_name, batch_size, tf.float32)
      step = None
    else:
      step = state_saver.state(state_name + '_step')
      c = state_saver.state(state_name + '_c')
      h = state_saver.state(state_name + '_h')
      init_state = (c, h)
    return lstm_cell, init_state, step
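The `state_saver` passed in here only needs to expose `state(name)`, returning the current tensor for a named state, and `save_state(name, value)`, returning an update op; this is the same pair that `extract_features` below relies on. As a rough sketch of that contract (a hypothetical helper, not part of this library; the tests earlier in this change use `tf.contrib.training.SequenceQueueingStateSaver` instead):

import tensorflow as tf

class DictStateSaver(object):
  """Hypothetical variable-backed saver exposing `state`/`save_state`."""

  def __init__(self, initial_states):
    # initial_states: dict mapping state name -> initial value (numpy/tensor).
    self._state_vars = {
        name: tf.Variable(value, trainable=False, name=name)
        for name, value in initial_states.items()
    }

  def state(self, name):
    # Read the current value of the named state as a tensor.
    return self._state_vars[name]

  def save_state(self, name, value):
    # Return an assign op that persists the new state; callers sequence it
    # with control dependencies, as extract_features does below.
    return tf.assign(self._state_vars[name], value)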
   def extract_features(self,
                        preprocessed_inputs,
                        state_saver=None,
...@@ -126,22 +158,12 @@ class LSTMSSDMobileNetV1FeatureExtractor(
     with slim.arg_scope(
         [slim.batch_norm], fused=False, is_training=self._is_training):
       # ConvLSTM layers.
+      batch_size = net.shape[0].value / unroll_length
       with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
-        lstm_cell = lstm_cells.BottleneckConvLSTMCell(
-            filter_size=(3, 3),
-            output_size=(net.shape[1].value, net.shape[2].value),
-            num_units=max(self._min_depth, self._lstm_state_depth),
-            activation=tf.nn.relu6,
-            visualize_gates=True)
+        lstm_cell, init_state, _ = self.create_lstm_cell(
+            batch_size, (net.shape[1].value, net.shape[2].value), state_saver,
+            state_name)
         net_seq = list(tf.split(net, unroll_length))
-        if state_saver is None:
-          init_state = lstm_cell.init_state(
-              state_name, net.shape[0].value / unroll_length, tf.float32)
-        else:
-          c = state_saver.state('%s_c' % state_name)
-          h = state_saver.state('%s_h' % state_name)
-          init_state = (c, h)
         # Identities added for inputting state tensors externally.
         c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
...@@ -157,7 +179,7 @@ class LSTMSSDMobileNetV1FeatureExtractor(
           batcher_ops = [
               state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
               state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
-              state_saver.save_state('%s_step' % state_name, self._step - 1)
+              state_saver.save_state('%s_step' % state_name, self._step + 1)
           ]
         with tf_ops.control_dependencies(batcher_ops):
           image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)
...
...@@ -42,11 +42,11 @@ class LstmSsdMobilenetV1FeatureExtractorTest(
       use_explicit_padding: A boolean whether to use explicit padding.

     Returns:
-      An lstm_ssd_meta_arch.LSTMMobileNetV1FeatureExtractor object.
+      An lstm_ssd_meta_arch.LSTMSSDMobileNetV1FeatureExtractor object.
     """
     min_depth = 32
     extractor = (
-        feature_extactor.LSTMMobileNetV1FeatureExtractor(
+        feature_extactor.LSTMSSDMobileNetV1FeatureExtractor(
             is_training,
             depth_multiplier,
             min_depth,
...
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Definitions for modified MobileNet models used in LSTD."""
import tensorflow as tf
from nets import mobilenet_v1
from nets.mobilenet import conv_blocks as mobilenet_convs
from nets.mobilenet import mobilenet
slim = tf.contrib.slim
def mobilenet_v1_lite_def(depth_multiplier, low_res=False):
"""Conv definitions for a lite MobileNet v1 model.
Args:
depth_multiplier: float depth multiplier for MobileNet.
    low_res: Whether to use a low-resolution conv input for the interleaved
      model.
Returns:
Array of convolutions.
Raises:
ValueError: On invalid channels with provided depth multiplier.
"""
conv = mobilenet_v1.Conv
sep_conv = mobilenet_v1.DepthSepConv
def _find_target_depth(original, depth_multiplier):
# Find the target depth such that:
# int(target * depth_multiplier) == original
pseudo_target = int(original / depth_multiplier)
for target in range(pseudo_target - 1, pseudo_target + 2):
if int(target * depth_multiplier) == original:
return target
raise ValueError('Cannot have %d channels with depth multiplier %0.2f' %
(original, depth_multiplier))
return [
conv(kernel=[3, 3], stride=2, depth=32),
sep_conv(kernel=[3, 3], stride=1, depth=64),
sep_conv(kernel=[3, 3], stride=2, depth=128),
sep_conv(kernel=[3, 3], stride=1, depth=128),
sep_conv(kernel=[3, 3], stride=2, depth=256),
sep_conv(kernel=[3, 3], stride=1, depth=256),
sep_conv(kernel=[3, 3], stride=2, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1 if low_res else 2, depth=1024),
sep_conv(
kernel=[3, 3],
stride=1,
depth=int(_find_target_depth(1024, depth_multiplier)))
]
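For intuition, `_find_target_depth` inverts the channel rounding `int(target * depth_multiplier)`: with a multiplier of 0.5 and 1024 desired output channels, it probes around 2048 and returns 2048, since int(2048 * 0.5) == 1024. A standalone check mirroring the helper above (the free function name here is illustrative):

def find_target_depth(original, depth_multiplier):
  # Same probe-around-the-quotient search as _find_target_depth above.
  pseudo_target = int(original / depth_multiplier)
  for target in range(pseudo_target - 1, pseudo_target + 2):
    if int(target * depth_multiplier) == original:
      return target
  raise ValueError('Cannot have %d channels with depth multiplier %0.2f' %
                   (original, depth_multiplier))

assert find_target_depth(1024, 0.5) == 2048  # int(2048 * 0.5) == 1024
assert find_target_depth(1024, 2.0) == 512   # int(512 * 2.0) == 1024
assert find_target_depth(1024, 1.0) == 1024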
def mobilenet_v2_lite_def(reduced=False, is_quantized=False, low_res=False):
"""Conv definitions for a lite MobileNet v2 model.
Args:
    reduced: Determines the scaling factor for expanded conv. If True, a
      factor of 3 is used. If False, a factor of 6 is used.
is_quantized: Whether the model is trained in quantized mode.
low_res: Whether the input to the model is of half resolution.
Returns:
Array of convolutions.
"""
expanded_conv = mobilenet_convs.expanded_conv
expand_input = mobilenet_convs.expand_input_by_factor
op = mobilenet.op
return dict(
defaults={
# Note: these parameters of batch norm affect the architecture
# that's why they are here and not in training_scope.
(slim.batch_norm,): {
'center': True,
'scale': True
},
(slim.conv2d, slim.fully_connected, slim.separable_conv2d): {
'normalizer_fn': slim.batch_norm,
'activation_fn': tf.nn.relu6
},
(expanded_conv,): {
'expansion_size': expand_input(6),
'split_expansion': 1,
'normalizer_fn': slim.batch_norm,
'residual': True
},
(slim.conv2d, slim.separable_conv2d): {
'padding': 'SAME'
}
},
spec=[
op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]),
op(expanded_conv,
expansion_size=expand_input(1, divisible_by=1),
num_outputs=16),
op(expanded_conv,
expansion_size=(expand_input(3, divisible_by=1)
if reduced else expand_input(6)),
stride=2,
num_outputs=24),
op(expanded_conv,
expansion_size=(expand_input(3, divisible_by=1)
if reduced else expand_input(6)),
stride=1,
num_outputs=24),
op(expanded_conv, stride=2, num_outputs=32),
op(expanded_conv, stride=1, num_outputs=32),
op(expanded_conv, stride=1, num_outputs=32),
op(expanded_conv, stride=2, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=96),
op(expanded_conv, stride=1, num_outputs=96),
op(expanded_conv, stride=1, num_outputs=96),
op(expanded_conv, stride=1 if low_res else 2, num_outputs=160),
op(expanded_conv, stride=1, num_outputs=160),
op(expanded_conv, stride=1, num_outputs=160),
op(expanded_conv,
stride=1,
num_outputs=320,
project_activation_fn=(tf.nn.relu6
if is_quantized else tf.identity))
],
)
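These definitions plug directly into `mobilenet_v2.mobilenet_base`, as the tests below exercise. A quick usage sketch for the quantized variant:

import tensorflow as tf
from lstm_object_detection.models import mobilenet_defs
from nets.mobilenet import mobilenet_v2

# Build the lite MobileNet v2 backbone on a 320x320 input.
inputs = tf.placeholder(tf.float32, (1, 320, 320, 3))
net, end_points = mobilenet_v2.mobilenet_base(
    inputs,
    min_depth=8,
    depth_multiplier=1.0,
    conv_defs=mobilenet_defs.mobilenet_v2_lite_def(is_quantized=True),
    use_explicit_padding=True,
    scope='MobilenetV2')
# Overall stride is 32, so net is [1, 10, 10, 320]; with is_quantized=True the
# final projection uses relu6 (the Relu6 op the tests below assert on).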
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lstm_object_detection.models.mobilenet_defs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from lstm_object_detection.models import mobilenet_defs
from nets import mobilenet_v1
from nets.mobilenet import mobilenet_v2
class MobilenetV1DefsTest(tf.test.TestCase):
def test_mobilenet_v1_lite_def(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(1.0),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 1024])
def test_mobilenet_v1_lite_def_depthmultiplier_half(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=0.5,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(0.5),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 1024])
def test_mobilenet_v1_lite_def_depthmultiplier_2x(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=2.0,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(2.0),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 1024])
def test_mobilenet_v1_lite_def_low_res(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(1.0, low_res=True),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 20, 20, 1024])
class MobilenetV2DefsTest(tf.test.TestCase):
def test_mobilenet_v2_lite_def(self):
net, features = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
self._assert_contains_op('MobilenetV2/expanded_conv_16/project/Identity')
self.assertEqual(
features['layer_3/expansion_output'].get_shape().as_list(),
[10, 160, 160, 96])
self.assertEqual(
features['layer_4/expansion_output'].get_shape().as_list(),
[10, 80, 80, 144])
def test_mobilenet_v2_lite_def_is_quantized(self):
net, _ = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(is_quantized=True),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
self._assert_contains_op('MobilenetV2/expanded_conv_16/project/Relu6')
def test_mobilenet_v2_lite_def_low_res(self):
net, _ = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(low_res=True),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 20, 20, 320])
def test_mobilenet_v2_lite_def_reduced(self):
net, features = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(reduced=True),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
self.assertEqual(
features['layer_3/expansion_output'].get_shape().as_list(),
[10, 160, 160, 48])
self.assertEqual(
features['layer_4/expansion_output'].get_shape().as_list(),
[10, 80, 80, 72])
def _assert_contains_op(self, op_name):
op_names = [op.name for op in tf.get_default_graph().get_operations()]
self.assertIn(op_name, op_names)
if __name__ == '__main__':
tf.test.main()
syntax = "proto2"; syntax = "proto2";
package lstm_object_detection.input_readers; package lstm_object_detection.protos;
import "object_detection/protos/input_reader.proto"; import "object_detection/protos/input_reader.proto";
...@@ -20,9 +20,8 @@ message TFRecordVideoInputReader { ...@@ -20,9 +20,8 @@ message TFRecordVideoInputReader {
enum DataType { enum DataType {
UNSPECIFIED = 0; UNSPECIFIED = 0;
ANNOTATED_IMAGE = 1; TF_EXAMPLE = 1;
TF_EXAMPLE = 2; TF_SEQUENCE_EXAMPLE = 2;
TF_SEQUENCE_EXAMPLE = 3;
} }
optional DataType data_type = 2 [default=TF_SEQUENCE_EXAMPLE]; optional DataType data_type = 2 [default=TF_SEQUENCE_EXAMPLE];
......
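With the renumbered enum above, a video input reader selects its record format through `data_type`. A minimal text-proto fragment (all other `TFRecordVideoInputReader` fields are omitted here):

  data_type: TF_SEQUENCE_EXAMPLE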
syntax = "proto2"; syntax = "proto2";
package object_detection.protos; package lstm_object_detection.protos;
import "object_detection/protos/pipeline.proto"; import "object_detection/protos/pipeline.proto";
import "lstm_object_detection/protos/quant_overrides.proto";
extend TrainEvalPipelineConfig { extend object_detection.protos.TrainEvalPipelineConfig {
optional LstmModel lstm_model = 205743444; optional LstmModel lstm_model = 205743444;
optional QuantOverrides quant_overrides = 246059837;
} }
// Message for extra fields needed for configuring LSTM model. // Message for extra fields needed for configuring LSTM model.
...@@ -18,4 +20,50 @@ message LstmModel { ...@@ -18,4 +20,50 @@ message LstmModel {
// Depth of the lstm feature map. // Depth of the lstm feature map.
optional int32 lstm_state_depth = 3 [default = 256]; optional int32 lstm_state_depth = 3 [default = 256];
// Depth multipliers for multiple feature extractors. Used for interleaved
// or ensemble model.
repeated float depth_multipliers = 4;
// Specifies how models are interleaved when multiple feature extractors are
// used during training. Must be in ['RANDOM', 'RANDOM_SKIP_SMALL'].
optional string train_interleave_method = 5 [default = 'RANDOM'];
  // Specifies how models are interleaved when multiple feature extractors are
  // used during evaluation. Must be in ['RANDOM', 'RANDOM_SKIP', 'SKIPK'],
  // where the K in 'SKIPK' is an integer (e.g. the default 'SKIP9').
  optional string eval_interleave_method = 6 [default = 'SKIP9'];
// The stride of the lstm state.
optional int32 lstm_state_stride = 7 [default = 32];
  // Whether to flatten LSTM state and output. Note that this is typically
  // intended only to be modified internally by export_tfmini_lstd_graph_lib
  // to support flattened state for tfmini/tflite. Do not set this field in
// the pipeline config file unless necessary.
optional bool flatten_state = 8 [default = false];
// Whether to apply bottleneck layer before going into LSTM gates. This
// allows multiple feature extractors to use separate bottleneck layers
// instead of sharing the same one so that different base model output
// feature dimensions are not forced to be the same.
// For example:
// Model 1 outputs feature map f_1 of depth d_1.
// Model 2 outputs feature map f_2 of depth d_2.
// Pre-bottlenecking allows lstm input to be either:
// conv(concat([f_1, h])) or conv(concat([f_2, h])).
optional bool pre_bottleneck = 9 [default = false];
// Normalize LSTM state, default false.
optional bool scale_state = 10 [default = false];
// Clip LSTM state at [0, 6], default true.
optional bool clip_state = 11 [default = true];
// If the model is in quantized training. This field does NOT need to be set
// manually. Instead, it will be overridden by configs in graph_rewriter.
optional bool is_quantized = 12 [default = false];
// Downsample input image when using the smaller network in interleaved
// models, default false.
optional bool low_res = 13 [default = false];
}
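Since `lstm_model` is declared as an extension of `object_detection.protos.TrainEvalPipelineConfig`, a pipeline config sets these fields under the fully qualified extension name. A hedged sketch of such a fragment (values are illustrative, not recommended settings):

  # pipeline.config fragment (text proto):
  [lstm_object_detection.protos.lstm_model] {
    lstm_state_depth: 256
    depth_multipliers: 1.0
    depth_multipliers: 0.35
    train_interleave_method: "RANDOM_SKIP_SMALL"
    eval_interleave_method: "SKIP9"
    pre_bottleneck: true
    low_res: true
  }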
syntax = "proto2";
package lstm_object_detection.protos;
// Message to override default quantization behavior.
message QuantOverrides {
repeated QuantConfig quant_configs = 1;
}
// Parameters to manually create fake quant ops outside of the generic
// tensorflow/contrib/quantize/python/quantize.py script. This may be used to
// override default behavior or to quantize ops not already supported.
message QuantConfig {
// The name of the op to add a fake quant op to.
required string op_name = 1;
// The name of the fake quant op.
required string quant_op_name = 2;
// Whether the fake quant op uses fixed ranges. Otherwise, learned moving
// average ranges are used.
required bool fixed_range = 3 [default = false];
  // The initial minimum value of the range.
optional float min = 4 [default = -6];
// The initial maximum value of the range.
optional float max = 5 [default = 6];
// Number of steps to delay before quantization takes effect during training.
optional int32 delay = 6 [default = 500000];
// Number of bits to use for quantizing weights.
// Only 8 bit is supported for now.
optional int32 weight_bits = 7 [default = 8];
// Number of bits to use for quantizing activations.
// Only 8 bit is supported for now.
optional int32 activation_bits = 8 [default = 8];
}
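Paired with the `quant_overrides` extension registered in pipeline.proto above, a config can pin a fixed quantization range on a single op. A sketch with hypothetical op names (every name below is a placeholder, not an op from this library):

  # pipeline.config fragment (text proto):
  [lstm_object_detection.protos.quant_overrides] {
    quant_configs {
      op_name: "LSTM/bottleneck/Relu6"            # hypothetical target op
      quant_op_name: "LSTM/bottleneck/act_quant"  # hypothetical fake-quant name
      fixed_range: true  # use the fixed [min, max] below, not moving averages
      min: -6
      max: 6
      delay: 500000
      weight_bits: 8
      activation_bits: 8
    }
  }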
...@@ -21,7 +21,6 @@ DetectionModel.
 import functools

 import tensorflow as tf

-from google3.pyglib import logging
 from object_detection.builders import optimizer_builder
 from object_detection.core import standard_fields as fields
...@@ -200,7 +199,7 @@ def get_restore_checkpoint_ops(restore_checkpoints, detection_model,
             var_map, restore_checkpoint))
     for var_name, var in available_var_map.iteritems():
       if var in vars_restored:
-        logging.info('Variable %s contained in multiple checkpoints',
-                     var.op.name)
+        tf.logging.info('Variable %s contained in multiple checkpoints',
+                        var.op.name)
         del available_var_map[var_name]
       else:
...@@ -221,7 +220,7 @@ def get_restore_checkpoint_ops(restore_checkpoints, detection_model,
     if available_var_map.keys():
       restorers.append(init_saver)
     else:
-      logging.info('WARNING: Checkpoint %s has no restorable variables',
-                   restore_checkpoint)
+      tf.logging.info('WARNING: Checkpoint %s has no restorable variables',
+                      restore_checkpoint)
     return restorers
...