Unverified Commit 58856e2b authored by Menglong Zhu, committed by GitHub

Merged commit includes the following changes: (#6726)

246873701  by menglong:

    Missing __init__.py under meta_architectures/

--
246857392  by menglong:

    Standardize proto namespace: lstm_object_detection.protos

--
246625127  by menglong:

    Internal changes.

--
246596481  by menglong:

    Add License

--
246580605  by menglong:

    Internal changes

--
246344626  by menglong:

    Open source interleaved mobilenet v2 model.

--
244893883  by menglong:

    Introduce multi_input_decoder for interleaved model.

--
244461016  by menglong:

    Add pre-bottleneck operation to lstm cells to support interleaved model.

--
244052176  by menglong:

    Update README

--
244020495  by menglong:

    Add test to rnn_decoder.

--
243704250  by menglong:

    Duplicate assignment.

--
243091836  by menglong:

    Move LSTMSSD meta arch into separate folder

--
242900337  by menglong:

    Modified mobilenet definition for LSTM-SSD

--
242773195  by menglong:

    Release GroupedConvLSTMCell implementation: https://arxiv.org/abs/1903.10172

--
242574736  by menglong:

    Introduce module for quantized training.

--
242544306  by menglong:

    lstm_ssd_meta_arch updates, added test
    rename:
    - LSTMMetaArch to LSTMSSDMetaArch
    - LSTMFeatureExtractor to LSTMSSDFeatureExtractor

--
241986236  by menglong:

    Move lstm quantization utils to 3rd party.

--
225922488  by yinxiao:

    Training pipeline fixes.

--
224839137  by yinxiao:

    Issue fix for lstm object detection sample config.

--
224246947  by menglong:

    Fix logging module import

--

PiperOrigin-RevId: 246873701
parent f5073f49
@@ -13,17 +13,21 @@
# limitations under the License.
# ==============================================================================
"""LSTM SSD Meta-architecture definition.
General tensorflow implementation of convolutional Multibox/SSD detection
models with LSTM states, for use on video data. This implementation supports
both the regular LSTM-SSD and the interleaved LSTM-SSD frameworks.
See https://arxiv.org/abs/1711.06368 and https://arxiv.org/abs/1903.10172
for details.
"""
import abc
import re
import tensorflow as tf
from object_detection.core import box_list_ops
from object_detection.core import matcher
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import ssd_meta_arch
from object_detection.utils import ops
@@ -32,7 +36,7 @@ from object_detection.utils import shape_utils
slim = tf.contrib.slim
class LSTMSSDMetaArch(ssd_meta_arch.SSDMetaArch):
"""LSTM Meta-architecture definition."""
def __init__(self,
@@ -54,7 +58,7 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
unroll_length,
target_assigner_instance,
add_summaries=True):
super(LSTMSSDMetaArch, self).__init__(
is_training=is_training,
anchor_generator=anchor_generator,
box_predictor=box_predictor,
@@ -94,26 +98,19 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
preprocessed_inputs)
self._batch_size = preprocessed_inputs.shape[0].value / self._unroll_length
self._states = states
anchors = self._anchor_generator.generate(feature_map_spatial_dims,
im_height=image_shape[1],
im_width=image_shape[2])
with tf.variable_scope('MultipleGridAnchorGenerator', reuse=tf.AUTO_REUSE):
self._anchors = box_list_ops.concatenate(anchors)
prediction_dict = self._box_predictor.predict(
feature_maps, self._anchor_generator.num_anchors_per_location())
box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1)
if box_encodings.shape.ndims == 4 and box_encodings.shape[2] == 1:
box_encodings = tf.squeeze(box_encodings, axis=2)
class_predictions_with_background = tf.concat(
prediction_dict['class_predictions_with_background'], axis=1)
predictions_dict = {
'preprocessed_inputs': preprocessed_inputs,
'box_encodings': box_encodings,
@@ -161,10 +158,11 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
if self.groundtruth_has_field(fields.BoxListFields.weights):
weights = self.groundtruth_lists(fields.BoxListFields.weights)
(batch_cls_targets, batch_cls_weights, batch_reg_targets,
batch_reg_weights, batch_match) = self._assign_targets(
self.groundtruth_lists(fields.BoxListFields.boxes),
self.groundtruth_lists(fields.BoxListFields.classes),
keypoints, weights)
match_list = [matcher.Match(match) for match in tf.unstack(batch_match)]
if self._add_summaries:
self._summarize_target_assignment(
self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
@@ -275,8 +273,18 @@ class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
return self._feature_extractor.get_base_network_scope()
class LSTMSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""LSTM SSD Meta-architecture Feature Extractor definition."""
__metaclass__ = abc.ABCMeta
@property
def clip_state(self):
return self._clip_state
@clip_state.setter
def clip_state(self, clip_state):
self._clip_state = clip_state
@property
def depth_multipliers(self):
@@ -294,6 +302,18 @@ class LSTMFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
def lstm_state_depth(self, lstm_state_depth):
self._lstm_state_depth = lstm_state_depth
@property
def is_quantized(self):
return self._is_quantized
@is_quantized.setter
def is_quantized(self, is_quantized):
self._is_quantized = is_quantized
@property
def interleaved(self):
return False
@property
def states_and_outputs(self):
"""LSTM states and outputs.
@@ -332,3 +352,81 @@ class LSTMFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
The variable scope of the base network, e.g. MobilenetV1
"""
return self._base_network_scope
@abc.abstractmethod
def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
"""Create the LSTM cell, and initialize state if necessary.
Args:
batch_size: input batch size.
output_size: output size of the lstm cell, [width, height].
state_saver: a state saver object with methods `state` and `save_state`.
state_name: string, the name to use with the state_saver.
Returns:
lstm_cell: the lstm cell unit.
init_state: initial state representations.
step: the step
"""
pass
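# For orientation, a minimal sketch of a concrete override, modeled on the
# GroupedConvLSTMCell implementation later in this change (assumes
# `from lstm_object_detection.lstm import lstm_cells`; the remaining cell
# flags are left at their defaults):
#
#   def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
#     lstm_cell = lstm_cells.GroupedConvLSTMCell(
#         filter_size=(3, 3),
#         output_size=output_size,
#         num_units=max(self._min_depth, self._lstm_state_depth),
#         is_training=self._is_training,
#         activation=tf.nn.relu6)
#     if state_saver is None:
#       # No state saver: start from the cell's own zero state.
#       init_state = lstm_cell.init_state('lstm_state', batch_size, tf.float32)
#       step = None
#     else:
#       # Restore (c, h) and the step counter from the state saver.
#       step = state_saver.state(state_name + '_step')
#       init_state = (state_saver.state(state_name + '_c'),
#                     state_saver.state(state_name + '_h'))
#     return lstm_cell, init_state, step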
class LSTMSSDInterleavedFeatureExtractor(LSTMSSDFeatureExtractor):
"""LSTM SSD Meta-architecture Interleaved Feature Extractor definition."""
__metaclass__ = abc.ABCMeta
@property
def pre_bottleneck(self):
return self._pre_bottleneck
@pre_bottleneck.setter
def pre_bottleneck(self, pre_bottleneck):
self._pre_bottleneck = pre_bottleneck
@property
def low_res(self):
return self._low_res
@low_res.setter
def low_res(self, low_res):
self._low_res = low_res
@property
def interleaved(self):
return True
@property
def interleave_method(self):
return self._interleave_method
@interleave_method.setter
def interleave_method(self, interleave_method):
self._interleave_method = interleave_method
@abc.abstractmethod
def extract_base_features_large(self, preprocessed_inputs):
"""Extract the large base model features.
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
pass
@abc.abstractmethod
def extract_base_features_small(self, preprocessed_inputs):
"""Extract the small base model features.
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
pass
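To make the interleaved contract concrete, a minimal (hypothetical) subclass might stub the two base networks as below; the FakeLSTMInterleavedFeatureExtractor in the tests that follow uses the same pattern, and a real create_lstm_cell appears in the MobilenetV2 extractor later in this change.
import tensorflow as tf
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch

slim = tf.contrib.slim


class ToyInterleavedFeatureExtractor(
    lstm_ssd_meta_arch.LSTMSSDInterleavedFeatureExtractor):
  """Hypothetical extractor with trivial large and small base networks."""

  def extract_base_features_large(self, preprocessed_inputs):
    # Large branch: a single 1x1 conv stands in for the full base network.
    with tf.variable_scope('base_large'):
      net = slim.conv2d(preprocessed_inputs, 32, [1, 1], scope='layer1')
    return net, {'layer1': net}

  def extract_base_features_small(self, preprocessed_inputs):
    # Small branch: same stub; a real extractor would use a cheaper network.
    with tf.variable_scope('base_small'):
      net = slim.conv2d(preprocessed_inputs, 32, [1, 1], scope='layer1')
    return net, {'layer1': net}

  def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
    # Elided here; see LSTMSSDInterleavedMobilenetV2FeatureExtractor below.
    raise NotImplementedError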
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for meta_architectures.lstm_ssd_meta_arch."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import tensorflow as tf
from lstm_object_detection.lstm import lstm_cells
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from object_detection.core import anchor_generator
from object_detection.core import box_list
from object_detection.core import losses
from object_detection.core import post_processing
from object_detection.core import region_similarity_calculator as sim_calc
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.models import feature_map_generators
from object_detection.utils import test_case
from object_detection.utils import test_utils
slim = tf.contrib.slim
MAX_TOTAL_NUM_BOXES = 5
NUM_CLASSES = 1
class FakeLSTMFeatureExtractor(
lstm_ssd_meta_arch.LSTMSSDFeatureExtractor):
def __init__(self):
super(FakeLSTMFeatureExtractor, self).__init__(
is_training=True,
depth_multiplier=1.0,
min_depth=0,
pad_to_multiple=1,
conv_hyperparams_fn=self.scope_fn)
self._lstm_state_depth = 256
def scope_fn(self):
with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu6) as sc:
return sc
def create_lstm_cell(self):
pass
def extract_features(self, preprocessed_inputs, state_saver=None,
state_name='lstm_state', unroll_length=5, scope=None):
with tf.variable_scope('mock_model'):
net = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
kernel_size=1, scope='layer1')
image_features = {'last_layer': net}
self._states_out = {}
feature_map_layout = {
'from_layer': ['last_layer'],
'layer_depth': [-1],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=(self._depth_multiplier),
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
class FakeLSTMInterleavedFeatureExtractor(
lstm_ssd_meta_arch.LSTMSSDInterleavedFeatureExtractor):
def __init__(self):
super(FakeLSTMInterleavedFeatureExtractor, self).__init__(
is_training=True,
depth_multiplier=1.0,
min_depth=0,
pad_to_multiple=1,
conv_hyperparams_fn=self.scope_fn)
self._lstm_state_depth = 256
def scope_fn(self):
with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu6) as sc:
return sc
def create_lstm_cell(self):
pass
def extract_base_features_large(self, preprocessed_inputs):
with tf.variable_scope('base_large'):
net = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
kernel_size=1, scope='layer1')
return net
def extract_base_features_small(self, preprocessed_inputs):
with tf.variable_scope('base_small'):
net = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32,
kernel_size=1, scope='layer1')
return net
def extract_features(self, preprocessed_inputs, state_saver=None,
state_name='lstm_state', unroll_length=5, scope=None):
with tf.variable_scope('mock_model'):
net_large = self.extract_base_features_large(preprocessed_inputs)
net_small = self.extract_base_features_small(preprocessed_inputs)
net = slim.conv2d(
inputs=tf.concat([net_large, net_small], axis=3),
num_outputs=32,
kernel_size=1,
scope='layer1')
image_features = {'last_layer': net}
self._states_out = {}
feature_map_layout = {
'from_layer': ['last_layer'],
'layer_depth': [-1],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=(self._depth_multiplier),
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator):
"""Sets up a simple 2x2 anchor grid on the unit square."""
def name_scope(self):
return 'MockAnchorGenerator'
def num_anchors_per_location(self):
return [1]
def _generate(self, feature_map_shape_list, im_height, im_width):
return [box_list.BoxList(
tf.constant([[0, 0, .5, .5],
[0, .5, .5, 1],
[.5, 0, 1, .5],
[1., 1., 1.5, 1.5] # Anchor that is outside clip_window.
], tf.float32))]
def num_anchors(self):
return 4
class LSTMSSDMetaArchTest(test_case.TestCase):
def _create_model(self,
interleaved=False,
apply_hard_mining=True,
normalize_loc_loss_by_codesize=False,
add_background_class=True,
random_example_sampling=False,
use_expected_classification_loss_under_sampling=False,
min_num_negative_samples=1,
desired_negative_sampling_ratio=3,
unroll_length=1):
num_classes = NUM_CLASSES
is_training = False
mock_anchor_generator = MockAnchorGenerator2x2()
mock_box_predictor = test_utils.MockBoxPredictor(is_training, num_classes)
mock_box_coder = test_utils.MockBoxCoder()
if interleaved:
fake_feature_extractor = FakeLSTMInterleavedFeatureExtractor()
else:
fake_feature_extractor = FakeLSTMFeatureExtractor()
mock_matcher = test_utils.MockMatcher()
region_similarity_calculator = sim_calc.IouSimilarity()
encode_background_as_zeros = False
def image_resizer_fn(image):
return [tf.identity(image), tf.shape(image)]
classification_loss = losses.WeightedSigmoidClassificationLoss()
localization_loss = losses.WeightedSmoothL1LocalizationLoss()
non_max_suppression_fn = functools.partial(
post_processing.batch_multiclass_non_max_suppression,
score_thresh=-20.0,
iou_thresh=1.0,
max_size_per_class=5,
max_total_size=MAX_TOTAL_NUM_BOXES)
classification_loss_weight = 1.0
localization_loss_weight = 1.0
negative_class_weight = 1.0
normalize_loss_by_num_matches = False
hard_example_miner = None
if apply_hard_mining:
# This hard example miner is expected to be a no-op.
hard_example_miner = losses.HardExampleMiner(
num_hard_examples=None,
iou_threshold=1.0)
target_assigner_instance = target_assigner.TargetAssigner(
region_similarity_calculator,
mock_matcher,
mock_box_coder,
negative_class_weight=negative_class_weight)
code_size = 4
model = lstm_ssd_meta_arch.LSTMSSDMetaArch(
is_training=is_training,
anchor_generator=mock_anchor_generator,
box_predictor=mock_box_predictor,
box_coder=mock_box_coder,
feature_extractor=fake_feature_extractor,
encode_background_as_zeros=encode_background_as_zeros,
image_resizer_fn=image_resizer_fn,
non_max_suppression_fn=non_max_suppression_fn,
score_conversion_fn=tf.identity,
classification_loss=classification_loss,
localization_loss=localization_loss,
classification_loss_weight=classification_loss_weight,
localization_loss_weight=localization_loss_weight,
normalize_loss_by_num_matches=normalize_loss_by_num_matches,
hard_example_miner=hard_example_miner,
unroll_length=unroll_length,
target_assigner_instance=target_assigner_instance,
add_summaries=False)
return model, num_classes, mock_anchor_generator.num_anchors(), code_size
def _get_value_for_matching_key(self, dictionary, suffix):
for key in dictionary.keys():
if key.endswith(suffix):
return dictionary[key]
raise ValueError('key not found {}'.format(suffix))
def test_predict_returns_correct_items_and_sizes(self):
batch_size = 3
height = width = 2
num_unroll = 1
graph = tf.Graph()
with graph.as_default():
model, num_classes, num_anchors, code_size = self._create_model()
preprocessed_images = tf.random_uniform(
[batch_size * num_unroll, height, width, 3],
minval=-1.,
maxval=1.)
true_image_shapes = tf.tile(
[[height, width, 3]], [batch_size, 1])
prediction_dict = model.predict(preprocessed_images, true_image_shapes)
self.assertIn('preprocessed_inputs', prediction_dict)
self.assertIn('box_encodings', prediction_dict)
self.assertIn('class_predictions_with_background', prediction_dict)
self.assertIn('feature_maps', prediction_dict)
self.assertIn('anchors', prediction_dict)
self.assertAllEqual(
[batch_size * num_unroll, height, width, 3],
prediction_dict['preprocessed_inputs'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, code_size],
prediction_dict['box_encodings'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, num_classes + 1],
prediction_dict['class_predictions_with_background'].shape.as_list())
self.assertAllEqual(
[num_anchors, code_size],
prediction_dict['anchors'].shape.as_list())
def test_interleaved_predict_returns_correct_items_and_sizes(self):
batch_size = 3
height = width = 2
num_unroll = 1
graph = tf.Graph()
with graph.as_default():
model, num_classes, num_anchors, code_size = self._create_model(
interleaved=True)
preprocessed_images = tf.random_uniform(
[batch_size * num_unroll, height, width, 3],
minval=-1.,
maxval=1.)
true_image_shapes = tf.tile(
[[height, width, 3]], [batch_size, 1])
prediction_dict = model.predict(preprocessed_images, true_image_shapes)
self.assertIn('preprocessed_inputs', prediction_dict)
self.assertIn('box_encodings', prediction_dict)
self.assertIn('class_predictions_with_background', prediction_dict)
self.assertIn('feature_maps', prediction_dict)
self.assertIn('anchors', prediction_dict)
self.assertAllEqual(
[batch_size * num_unroll, height, width, 3],
prediction_dict['preprocessed_inputs'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, code_size],
prediction_dict['box_encodings'].shape.as_list())
self.assertAllEqual(
[batch_size * num_unroll, num_anchors, num_classes + 1],
prediction_dict['class_predictions_with_background'].shape.as_list())
self.assertAllEqual(
[num_anchors, code_size],
prediction_dict['anchors'].shape.as_list())
if __name__ == '__main__':
tf.test.main()
@@ -14,8 +14,9 @@
# ==============================================================================
"""A function to build a DetectionModel from configuration."""
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from lstm_object_detection.models import lstm_ssd_interleaved_mobilenet_v2_feature_extractor
from lstm_object_detection.models import lstm_ssd_mobilenet_v1_feature_extractor
from object_detection.builders import anchor_generator_builder
from object_detection.builders import box_coder_builder
from object_detection.builders import box_predictor_builder
@@ -29,7 +30,12 @@ from object_detection.builders import region_similarity_calculator_builder as si
from object_detection.core import target_assigner
model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP.update({
'lstm_ssd_mobilenet_v1':
lstm_ssd_mobilenet_v1_feature_extractor
.LSTMSSDMobileNetV1FeatureExtractor,
'lstm_ssd_interleaved_mobilenet_v2':
lstm_ssd_interleaved_mobilenet_v2_feature_extractor
.LSTMSSDInterleavedMobilenetV2FeatureExtractor,
})
SSD_FEATURE_EXTRACTOR_CLASS_MAP = model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP
@@ -54,14 +60,14 @@ def build(model_config, lstm_config, is_training):
def _build_lstm_feature_extractor(feature_extractor_config,
is_training,
lstm_config,
reuse_weights=None):
"""Builds a ssd_meta_arch.SSDFeatureExtractor based on config.
Args:
feature_extractor_config: A SSDFeatureExtractor proto config from ssd.proto.
is_training: True if this feature extractor is being built for training.
lstm_config: LSTM-SSD specific configs.
reuse_weights: If the feature extractor should reuse weights.
Returns:
@@ -86,10 +92,27 @@ def _build_lstm_feature_extractor(feature_extractor_config,
raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type))
feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
feature_extractor = feature_extractor_class(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams)
# Extra configs for LSTM-SSD.
feature_extractor.lstm_state_depth = lstm_config.lstm_state_depth
feature_extractor.flatten_state = lstm_config.flatten_state
feature_extractor.clip_state = lstm_config.clip_state
feature_extractor.scale_state = lstm_config.scale_state
feature_extractor.is_quantized = lstm_config.is_quantized
feature_extractor.low_res = lstm_config.low_res
# Extra configs for interleaved LSTM-SSD.
if 'interleaved' in feature_extractor_config.type:
feature_extractor.pre_bottleneck = lstm_config.pre_bottleneck
feature_extractor.depth_multipliers = lstm_config.depth_multipliers
if is_training:
feature_extractor.interleave_method = lstm_config.train_interleave_method
else:
feature_extractor.interleave_method = lstm_config.eval_interleave_method
return feature_extractor
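# For reference, a (hypothetical) lstm_model config exercising the interleaved
# fields wired above; it mirrors the interleaved test config later in this
# change, and any fields not set here fall back to their proto defaults:
#
#   [lstm_object_detection.protos.lstm_model] {
#     lstm_state_depth: 320
#     depth_multipliers: 1.4
#     depth_multipliers: 0.35
#     pre_bottleneck: true
#     low_res: true
#     train_interleave_method: 'RANDOM_SKIP_SMALL'
#     eval_interleave_method: 'SKIP3'
#   }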
def _build_lstm_model(ssd_config, lstm_config, is_training):
@@ -97,19 +120,19 @@ def _build_lstm_model(ssd_config, lstm_config, is_training):
Args:
ssd_config: A ssd.proto object containing the config for the desired
LSTMSSDMetaArch.
lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
is_training: True if this model is being built for training purposes.
Returns:
LSTMSSDMetaArch based on the config.
Raises:
ValueError: If ssd_config.type is not recognized (i.e. not registered in
model_class_map), or if lstm_config.interleave_strategy is not recognized.
ValueError: If unroll_length is not specified in the config file.
"""
feature_extractor = _build_lstm_feature_extractor(
ssd_config.feature_extractor, is_training, lstm_config)
box_coder = box_coder_builder.build(ssd_config.box_coder)
matcher = matcher_builder.build(ssd_config.matcher)
@@ -147,7 +170,7 @@ def _build_lstm_model(ssd_config, lstm_config, is_training):
box_coder,
negative_class_weight=negative_class_weight)
lstm_model = lstm_ssd_meta_arch.LSTMSSDMetaArch(
is_training=is_training,
anchor_generator=anchor_generator,
box_predictor=ssd_box_predictor,
...
@@ -13,19 +13,19 @@
# limitations under the License.
# ==============================================================================
"""Tests for lstm_object_detection.tensorflow.model_builder."""
import tensorflow as tf
from google.protobuf import text_format
from lstm_object_detection import model_builder
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from lstm_object_detection.protos import pipeline_pb2 as internal_pipeline_pb2
from object_detection.protos import pipeline_pb2
class ModelBuilderTest(tf.test.TestCase):
def create_train_model(self, model_config, lstm_config):
"""Builds a DetectionModel based on the model config.
Args:
@@ -39,6 +39,20 @@ class ModelBuilderTest(tf.test.TestCase):
"""
return model_builder.build(model_config, lstm_config, is_training=True)
def create_eval_model(self, model_config, lstm_config):
"""Builds a DetectionModel based on the model config.
Args:
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LstmModel config proto that specifies LSTM train/eval
configs.
Returns:
DetectionModel based on the config.
"""
return model_builder.build(model_config, lstm_config, is_training=False)
def get_model_configs_from_proto(self):
"""Creates a model text proto for testing.
@@ -47,14 +61,110 @@ class ModelBuilderTest(tf.test.TestCase):
"""
model_text_proto = """
[lstm_object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 4
}
model {
ssd {
feature_extractor {
type: 'lstm_ssd_mobilenet_v1'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
negative_class_weight: 2.0
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
aspect_ratios: 1.0
}
}
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
}
normalize_loc_loss_by_codesize: true
loss {
classification_loss {
weighted_softmax {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
}
}
}"""
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
text_format.Merge(model_text_proto, pipeline_config)
configs = {}
configs['model'] = pipeline_config.model
configs['lstm_model'] = pipeline_config.Extensions[
internal_pipeline_pb2.lstm_model]
return configs
def get_interleaved_model_configs_from_proto(self):
"""Creates an interleaved model text proto for testing.
Returns:
A dictionary of model configs.
"""
model_text_proto = """
[lstm_object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 10
lstm_state_depth: 320
depth_multipliers: 1.4
depth_multipliers: 0.35
pre_bottleneck: true
low_res: true
train_interleave_method: 'RANDOM_SKIP_SMALL'
eval_interleave_method: 'SKIP3'
}
model {
ssd {
feature_extractor {
type: 'lstm_ssd_interleaved_mobilenet_v2'
conv_hyperparams {
regularizer {
l2_regularizer {
@@ -134,24 +244,58 @@ class ModelBuilderTest(tf.test.TestCase):
self.assertEqual(configs['model'].ssd.negative_class_weight, 2.0)
self.assertTrue(configs['model'].ssd.normalize_loc_loss_by_codesize)
self.assertEqual(configs['model'].ssd.feature_extractor.type,
'lstm_ssd_mobilenet_v1')
model = self.create_train_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM unroll length.
self.assertEqual(model.unroll_length, 4)
model = self.create_eval_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM configs.
self.assertEqual(model.unroll_length, 4)
def test_interleaved_model_creation_from_valid_configs(self):
configs = self.get_interleaved_model_configs_from_proto()
# Test model properties.
self.assertEqual(configs['model'].ssd.negative_class_weight, 2.0)
self.assertTrue(configs['model'].ssd.normalize_loc_loss_by_codesize)
self.assertEqual(configs['model'].ssd.feature_extractor.type,
'lstm_ssd_interleaved_mobilenet_v2')
model = self.create_train_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM configs.
self.assertEqual(model.unroll_length, 4)
self.assertEqual(model._feature_extractor.lstm_state_depth, 320)
self.assertAllClose(model._feature_extractor.depth_multipliers, (1.4, 0.35))
self.assertTrue(model._feature_extractor.pre_bottleneck)
self.assertTrue(model._feature_extractor.low_res)
self.assertEqual(model._feature_extractor.interleave_method,
'RANDOM_SKIP_SMALL')
model = self.create_eval_model(configs['model'], configs['lstm_model'])
# Test architecture type.
self.assertIsInstance(model, lstm_ssd_meta_arch.LSTMSSDMetaArch)
# Test LSTM configs.
self.assertEqual(model.unroll_length, 10)
self.assertEqual(model._feature_extractor.lstm_state_depth, 320)
self.assertAllClose(model._feature_extractor.depth_multipliers, (1.4, 0.35))
self.assertTrue(model._feature_extractor.pre_bottleneck)
self.assertTrue(model._feature_extractor.low_res)
self.assertEqual(model._feature_extractor.interleave_method, 'SKIP3')
def test_model_creation_from_invalid_configs(self):
configs = self.get_model_configs_from_proto()
# Test model build failure with wrong input configs.
with self.assertRaises(AttributeError):
_ = self.create_train_model(configs['model'], configs['model'])
with self.assertRaises(AttributeError):
_ = self.create_eval_model(configs['model'], configs['model'])
if __name__ == '__main__':
...
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTDInterleavedFeatureExtractor which interleaves multiple MobileNet V2."""
import tensorflow as tf
from tensorflow.python.framework import ops as tf_ops
from lstm_object_detection.lstm import lstm_cells
from lstm_object_detection.lstm import rnn_decoder
from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
from lstm_object_detection.models import mobilenet_defs
from object_detection.models import feature_map_generators
from object_detection.utils import ops
from object_detection.utils import shape_utils
from nets.mobilenet import mobilenet
from nets.mobilenet import mobilenet_v2
slim = tf.contrib.slim
class LSTMSSDInterleavedMobilenetV2FeatureExtractor(
lstm_ssd_meta_arch.LSTMSSDInterleavedFeatureExtractor):
"""LSTM-SSD Interleaved Feature Extractor using MobilenetV2 features."""
def __init__(self,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams_fn,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=True,
override_base_feature_extractor_hyperparams=False):
"""Interleaved Feature Extractor for LSTD Models with MobileNet v2.
Args:
is_training: whether the network is in training mode.
depth_multiplier: float depth multiplier for feature extractor.
min_depth: minimum feature extractor depth.
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False.
use_depthwise: Whether to use depthwise convolutions. Default is True.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
"""
super(LSTMSSDInterleavedMobilenetV2FeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams)
# RANDOM_SKIP_SMALL means the training policy is random and the small model
# does not update state during training.
if self._is_training:
self._interleave_method = 'RANDOM_SKIP_SMALL'
else:
self._interleave_method = 'SKIP9'
self._flatten_state = False
self._scale_state = False
self._clip_state = True
self._pre_bottleneck = True
self._feature_map_layout = {
'from_layer': ['layer_19', '', '', '', ''],
'layer_depth': [-1, 256, 256, 256, 256],
'use_depthwise': self._use_depthwise,
'use_explicit_padding': self._use_explicit_padding,
}
self._low_res = True
self._base_network_scope = 'MobilenetV2'
def extract_base_features_large(self, preprocessed_inputs):
"""Extract the large base model features.
Variables are created under the scope of <scope>/MobilenetV2_1/
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
scope_name = self._base_network_scope + '_1'
with tf.variable_scope(scope_name, reuse=self._reuse_weights) as base_scope:
net, end_points = mobilenet_v2.mobilenet_base(
preprocessed_inputs,
depth_multiplier=self._depth_multipliers[0],
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(
is_quantized=self._is_quantized),
use_explicit_padding=self._use_explicit_padding,
scope=base_scope)
return net, end_points
def extract_base_features_small(self, preprocessed_inputs):
"""Extract the small base model features.
Variables are created under the scope of <scope>/MobilenetV2_2/
Args:
preprocessed_inputs: preprocessed input images of shape:
[batch, width, height, depth].
Returns:
net: the last feature map created from the base feature extractor.
end_points: a dictionary of feature maps created.
"""
scope_name = self._base_network_scope + '_2'
with tf.variable_scope(scope_name, reuse=self._reuse_weights) as base_scope:
if self._low_res:
size_small = preprocessed_inputs.get_shape().as_list()[1] / 2
inputs_small = tf.image.resize_images(preprocessed_inputs,
[size_small, size_small])
# Create end point handle for tflite deployment.
with tf.name_scope(None):
inputs_small = tf.identity(
inputs_small, name='normalized_input_image_tensor_small')
else:
inputs_small = preprocessed_inputs
net, end_points = mobilenet_v2.mobilenet_base(
inputs_small,
depth_multiplier=self._depth_multipliers[1],
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(
is_quantized=self._is_quantized, low_res=self._low_res),
use_explicit_padding=self._use_explicit_padding,
scope=base_scope)
return net, end_points
def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
"""Create the LSTM cell, and initialize state if necessary.
Args:
batch_size: input batch size.
output_size: output size of the lstm cell, [width, height].
state_saver: a state saver object with methods `state` and `save_state`.
state_name: string, the name to use with the state_saver.
Returns:
lstm_cell: the lstm cell unit.
init_state: initial state representations.
step: the step
"""
lstm_cell = lstm_cells.GroupedConvLSTMCell(
filter_size=(3, 3),
output_size=output_size,
num_units=max(self._min_depth, self._lstm_state_depth),
is_training=self._is_training,
activation=tf.nn.relu6,
flatten_state=self._flatten_state,
scale_state=self._scale_state,
clip_state=self._clip_state,
output_bottleneck=True,
pre_bottleneck=self._pre_bottleneck,
is_quantized=self._is_quantized,
visualize_gates=False)
if state_saver is None:
init_state = lstm_cell.init_state('lstm_state', batch_size, tf.float32)
step = None
else:
step = state_saver.state(state_name + '_step')
c = state_saver.state(state_name + '_c')
h = state_saver.state(state_name + '_h')
c.set_shape([batch_size] + c.get_shape().as_list()[1:])
h.set_shape([batch_size] + h.get_shape().as_list()[1:])
init_state = (c, h)
return lstm_cell, init_state, step
def extract_features(self, preprocessed_inputs, state_saver=None,
state_name='lstm_state', unroll_length=10, scope=None):
"""Extract features from preprocessed inputs.
The features include the base network features, lstm features and SSD
features, organized in the following name scope:
<scope>/MobilenetV2_1/...
<scope>/MobilenetV2_2/...
<scope>/LSTM/...
<scope>/FeatureMap/...
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of consecutive frames from video clips.
state_saver: A state saver object with methods `state` and `save_state`.
state_name: Python string, the name to use with the state_saver.
unroll_length: number of steps to unroll the lstm.
scope: Scope for the base network of the feature extractor.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
Raises:
ValueError: if interleave_method not recognized or large and small base
network output feature maps of different sizes.
"""
preprocessed_inputs = shape_utils.check_min_image_dim(
33, preprocessed_inputs)
preprocessed_inputs = ops.pad_to_multiple(
preprocessed_inputs, self._pad_to_multiple)
batch_size = preprocessed_inputs.shape[0].value / unroll_length
batch_axis = 0
nets = []
# Batch processing of mobilenet features.
with slim.arg_scope(mobilenet_v2.training_scope(
is_training=self._is_training,
bn_decay=0.9997)), \
slim.arg_scope([mobilenet.depth_multiplier],
min_depth=self._min_depth, divisible_by=8):
# Big model.
net, _ = self.extract_base_features_large(preprocessed_inputs)
nets.append(net)
large_base_feature_shape = net.shape
# Small models
net, _ = self.extract_base_features_small(preprocessed_inputs)
nets.append(net)
small_base_feature_shape = net.shape
if not (large_base_feature_shape[1] == small_base_feature_shape[1] and
large_base_feature_shape[2] == small_base_feature_shape[2]):
raise ValueError('Large and Small base network feature map dimension '
'not equal!')
with slim.arg_scope(self._conv_hyperparams_fn()):
with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
output_size = (large_base_feature_shape[1], large_base_feature_shape[2])
lstm_cell, init_state, step = self.create_lstm_cell(
batch_size, output_size, state_saver, state_name)
nets_seq = [
tf.split(net, unroll_length, axis=batch_axis) for net in nets
]
net_seq, states_out = rnn_decoder.multi_input_rnn_decoder(
nets_seq,
init_state,
lstm_cell,
step,
selection_strategy=self._interleave_method,
is_training=self._is_training,
pre_bottleneck=self._pre_bottleneck,
flatten_state=self._flatten_state,
scope=lstm_scope)
self._states_out = states_out
batcher_ops = None
if state_saver is not None:
self._step = state_saver.state(state_name + '_step')
batcher_ops = [
state_saver.save_state(state_name + '_c', states_out[-1][0]),
state_saver.save_state(state_name + '_h', states_out[-1][1]),
state_saver.save_state(state_name + '_step', self._step + 1)]
image_features = {}
with tf_ops.control_dependencies(batcher_ops):
image_features['layer_19'] = tf.concat(net_seq, 0)
# SSD layers.
with tf.variable_scope('FeatureMap'):
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=self._feature_map_layout,
depth_multiplier=self._depth_multiplier,
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features,
pool_residual=True)
return feature_maps.values()
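As a rough usage sketch (assembled from the feature extractor tests that follow; the constructor arguments, image size, and depth multipliers are illustrative, not prescribed):
import tensorflow as tf
from lstm_object_detection.models import lstm_ssd_interleaved_mobilenet_v2_feature_extractor

slim = tf.contrib.slim


def conv_hyperparams_fn():
  # Minimal slim hyperparams scope, as in the tests below.
  with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm), \
      slim.arg_scope([slim.batch_norm], is_training=False) as sc:
    return sc

extractor = (lstm_ssd_interleaved_mobilenet_v2_feature_extractor
             .LSTMSSDInterleavedMobilenetV2FeatureExtractor(
                 False, 1.0, 32, 1, conv_hyperparams_fn))
extractor.lstm_state_depth = 320
extractor.depth_multipliers = [1.0, 0.25]
extractor.is_quantized = False
images = tf.placeholder(tf.float32, [1, 320, 320, 3])
# preprocess() maps pixels into [-1, 1]; extract_features() runs both base
# networks, the interleaved LSTM, and the SSD feature map layers.
feature_maps = extractor.extract_features(
    extractor.preprocess(images), unroll_length=1)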
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lstm_ssd_interleaved_mobilenet_v2_feature_extractor."""
import itertools
import numpy as np
import tensorflow as tf
from lstm_object_detection.models import lstm_ssd_interleaved_mobilenet_v2_feature_extractor
from object_detection.models import ssd_feature_extractor_test
slim = tf.contrib.slim
class LSTMSSDInterleavedMobilenetV2FeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
def _create_feature_extractor(self,
depth_multiplier,
pad_to_multiple,
is_quantized=False):
"""Constructs a new feature extractor.
Args:
depth_multiplier: float depth multiplier for feature extractor
pad_to_multiple: the nearest multiple to zero pad the input height and
width dimensions to.
is_quantized: whether to quantize the graph.
Returns:
an ssd_meta_arch.SSDFeatureExtractor object.
"""
min_depth = 32
def conv_hyperparams_fn():
with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm), \
slim.arg_scope([slim.batch_norm], is_training=False) as sc:
return sc
feature_extractor = (
lstm_ssd_interleaved_mobilenet_v2_feature_extractor
.LSTMSSDInterleavedMobilenetV2FeatureExtractor(False, depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams_fn))
feature_extractor.lstm_state_depth = int(320 * depth_multiplier)
feature_extractor.depth_multipliers = [
depth_multiplier, depth_multiplier / 4.0
]
feature_extractor.is_quantized = is_quantized
return feature_extractor
def test_extract_features_returns_correct_shapes_128(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 4, 4, 640),
(2, 2, 2, 256), (2, 1, 1, 256),
(2, 1, 1, 256), (2, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_unroll10(self):
image_height = 128
image_width = 128
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(10, 4, 4, 640),
(10, 2, 2, 256), (10, 1, 1, 256),
(10, 1, 1, 256), (10, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
10, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape, unroll_length=10)
def test_extract_features_returns_correct_shapes_320(self):
image_height = 320
image_width = 320
depth_multiplier = 1.0
pad_to_multiple = 1
expected_feature_map_shape = [(2, 10, 10, 640),
(2, 5, 5, 256), (2, 3, 3, 256),
(2, 2, 2, 256), (2, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_enforcing_min_depth(self):
image_height = 320
image_width = 320
depth_multiplier = 0.5**12
pad_to_multiple = 1
expected_feature_map_shape = [(2, 10, 10, 64),
(2, 5, 5, 32), (2, 3, 3, 32),
(2, 2, 2, 32), (2, 1, 1, 32)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self):
image_height = 299
image_width = 299
depth_multiplier = 1.0
pad_to_multiple = 32
expected_feature_map_shape = [(2, 10, 10, 640),
(2, 5, 5, 256), (2, 3, 3, 256),
(2, 2, 2, 256), (2, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
2, image_height, image_width, depth_multiplier, pad_to_multiple,
expected_feature_map_shape)
def test_preprocess_returns_correct_value_range(self):
image_height = 128
image_width = 128
depth_multiplier = 1
pad_to_multiple = 1
test_image = np.random.rand(4, image_height, image_width, 3)
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
def test_variables_only_created_in_scope(self):
depth_multiplier = 1
pad_to_multiple = 1
scope_names = ['MobilenetV2', 'LSTM', 'FeatureMap']
self.check_feature_extractor_variables_under_scopes(
depth_multiplier, pad_to_multiple, scope_names)
def test_has_fused_batchnorm(self):
image_height = 40
image_width = 40
depth_multiplier = 1
pad_to_multiple = 32
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
self.assertTrue(any(op.type == 'FusedBatchNorm'
for op in tf.get_default_graph().get_operations()))
def test_variables_for_tflite(self):
image_height = 40
image_width = 40
depth_multiplier = 1
pad_to_multiple = 32
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
tflite_unsupported = ['SquaredDifference']
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
self.assertFalse(any(op.type in tflite_unsupported
for op in tf.get_default_graph().get_operations()))
def test_output_nodes_for_tflite(self):
image_height = 64
image_width = 64
depth_multiplier = 1.0
pad_to_multiple = 1
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
tflite_nodes = [
'raw_inputs/init_lstm_c',
'raw_inputs/init_lstm_h',
'raw_inputs/base_endpoint',
'raw_outputs/lstm_c',
'raw_outputs/lstm_h',
'raw_outputs/base_endpoint_1',
'raw_outputs/base_endpoint_2'
]
ops_names = [op.name for op in tf.get_default_graph().get_operations()]
for node in tflite_nodes:
self.assertTrue(any(node in s for s in ops_names))
def test_fixed_concat_nodes(self):
image_height = 64
image_width = 64
depth_multiplier = 1.0
pad_to_multiple = 1
image_placeholder = tf.placeholder(tf.float32,
[1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(
depth_multiplier, pad_to_multiple, is_quantized=True)
preprocessed_image = feature_extractor.preprocess(image_placeholder)
_ = feature_extractor.extract_features(preprocessed_image, unroll_length=1)
concat_nodes = [
'MobilenetV2_1/expanded_conv_16/project/Relu6',
'MobilenetV2_2/expanded_conv_16/project/Relu6'
]
ops_names = [op.name for op in tf.get_default_graph().get_operations()]
for node in concat_nodes:
self.assertTrue(any(node in s for s in ops_names))
def test_lstm_states(self):
image_height = 256
image_width = 256
depth_multiplier = 1
pad_to_multiple = 1
state_channel = 320
init_state1 = {
'lstm_state_c': tf.zeros(
[image_height/32, image_width/32, state_channel]),
'lstm_state_h': tf.zeros(
[image_height/32, image_width/32, state_channel]),
'lstm_state_step': tf.zeros([1])
}
init_state2 = {
'lstm_state_c': tf.random_uniform(
[image_height/32, image_width/32, state_channel]),
'lstm_state_h': tf.random_uniform(
[image_height/32, image_width/32, state_channel]),
'lstm_state_step': tf.zeros([1])
}
seq = {'dummy': tf.random_uniform([2, 1, 1, 1])}
stateful_reader1 = tf.contrib.training.SequenceQueueingStateSaver(
batch_size=1, num_unroll=1, input_length=2, input_key='',
input_sequences=seq, input_context={}, initial_states=init_state1,
capacity=1)
stateful_reader2 = tf.contrib.training.SequenceQueueingStateSaver(
batch_size=1, num_unroll=1, input_length=2, input_key='',
input_sequences=seq, input_context={}, initial_states=init_state2,
capacity=1)
image = tf.random_uniform([1, image_height, image_width, 3])
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
with tf.variable_scope('zero_state'):
feature_maps1 = feature_extractor.extract_features(
image, stateful_reader1.next_batch, unroll_length=1)
with tf.variable_scope('random_state'):
feature_maps2 = feature_extractor.extract_features(
image, stateful_reader2.next_batch, unroll_length=1)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
sess.run(tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS))
sess.run([stateful_reader1.prefetch_op, stateful_reader2.prefetch_op])
maps1, maps2 = sess.run([feature_maps1, feature_maps2])
state = sess.run(stateful_reader1.next_batch.state('lstm_state_c'))
# feature maps should be different because states are different
self.assertFalse(np.all(np.equal(maps1[0], maps2[0])))
# state should no longer be zero after update
self.assertTrue(state.any())
def check_extract_features_returns_correct_shape(
self, batch_size, image_height, image_width, depth_multiplier,
pad_to_multiple, expected_feature_map_shapes, unroll_length=1):
def graph_fn(image_tensor):
feature_extractor = self._create_feature_extractor(depth_multiplier,
pad_to_multiple)
feature_maps = feature_extractor.extract_features(
image_tensor, unroll_length=unroll_length)
return feature_maps
image_tensor = np.random.rand(batch_size, image_height, image_width,
3).astype(np.float32)
feature_maps = self.execute(graph_fn, [image_tensor])
for feature_map, expected_shape in itertools.izip(
feature_maps, expected_feature_map_shapes):
self.assertAllEqual(feature_map.shape, expected_shape)
def check_feature_extractor_variables_under_scopes(
self, depth_multiplier, pad_to_multiple, scope_names):
g = tf.Graph()
with g.as_default():
feature_extractor = self._create_feature_extractor(
depth_multiplier, pad_to_multiple)
preprocessed_inputs = tf.placeholder(tf.float32, (4, 320, 320, 3))
feature_extractor.extract_features(
preprocessed_inputs, unroll_length=1)
variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
for variable in variables:
self.assertTrue(
any([
variable.name.startswith(scope_name)
for scope_name in scope_names
]), 'Variable name: ' + variable.name +
' is not under any provided scopes: ' + ','.join(scope_names))
if __name__ == '__main__':
tf.test.main()
...@@ -13,13 +13,13 @@
 # limitations under the License.
 # ==============================================================================
-"""LSTMFeatureExtractor for MobilenetV1 features."""
+"""LSTMSSDFeatureExtractor for MobilenetV1 features."""

 import tensorflow as tf
 from tensorflow.python.framework import ops as tf_ops
 from lstm_object_detection.lstm import lstm_cells
-from lstm_object_detection.lstm import lstm_meta_arch
 from lstm_object_detection.lstm import rnn_decoder
+from lstm_object_detection.meta_architectures import lstm_ssd_meta_arch
 from object_detection.models import feature_map_generators
 from object_detection.utils import context_manager
 from object_detection.utils import ops
...@@ -29,7 +29,8 @@ from nets import mobilenet_v1
 slim = tf.contrib.slim


-class LSTMMobileNetV1FeatureExtractor(lstm_meta_arch.LSTMFeatureExtractor):
+class LSTMSSDMobileNetV1FeatureExtractor(
+    lstm_ssd_meta_arch.LSTMSSDFeatureExtractor):
   """LSTM Feature Extractor using MobilenetV1 features."""

   def __init__(self,
...@@ -37,13 +38,13 @@ class LSTMSSDMobileNetV1FeatureExtractor(
                depth_multiplier,
                min_depth,
                pad_to_multiple,
-               conv_hyperparams,
+               conv_hyperparams_fn,
                reuse_weights=None,
                use_explicit_padding=False,
                use_depthwise=True,
                override_base_feature_extractor_hyperparams=False,
                lstm_state_depth=256):
-    """Initializes instance of MobileNetV1 Feature Extractor for LSTM Models.
+    """Initializes instance of MobileNetV1 Feature Extractor for LSTMSSD Models.
     Args:
       is_training: A boolean whether the network is in training mode.
...@@ -51,7 +52,7 @@ class LSTMSSDMobileNetV1FeatureExtractor(
       min_depth: A number representing minimum feature extractor depth.
       pad_to_multiple: The nearest multiple to zero pad the input height and
         width dimensions to.
-      conv_hyperparams: A function to construct tf slim arg_scope for conv2d
+      conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d
         and separable_conv2d ops in the layers that are added on top of the
         base feature extractor.
       reuse_weights: Whether to reuse variables. Default is None.
...@@ -63,9 +64,9 @@ class LSTMSSDMobileNetV1FeatureExtractor(
         `conv_hyperparams_fn`.
       lstm_state_depth: An integer of the depth of the lstm state.
     """
-    super(LSTMMobileNetV1FeatureExtractor, self).__init__(
+    super(LSTMSSDMobileNetV1FeatureExtractor, self).__init__(
         is_training, depth_multiplier, min_depth, pad_to_multiple,
-        conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
+        conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise,
         override_base_feature_extractor_hyperparams)
     self._feature_map_layout = {
         'from_layer': ['Conv2d_13_pointwise_lstm', '', '', '', ''],
...@@ -76,6 +77,37 @@ class LSTMSSDMobileNetV1FeatureExtractor(
     self._base_network_scope = 'MobilenetV1'
     self._lstm_state_depth = lstm_state_depth

  def create_lstm_cell(self, batch_size, output_size, state_saver, state_name):
    """Create the LSTM cell, and initialize state if necessary.

    Args:
      batch_size: input batch size.
      output_size: output size of the lstm cell, [width, height].
      state_saver: a state saver object with methods `state` and `save_state`.
      state_name: string, the name to use with the state_saver.

    Returns:
      lstm_cell: the lstm cell unit.
      init_state: initial state representations.
      step: the current step tensor from the state saver, or None if no state
        saver is provided.
    """
    lstm_cell = lstm_cells.BottleneckConvLSTMCell(
        filter_size=(3, 3),
        output_size=output_size,
        num_units=max(self._min_depth, self._lstm_state_depth),
        activation=tf.nn.relu6,
        visualize_gates=False)

    if state_saver is None:
      init_state = lstm_cell.init_state(state_name, batch_size, tf.float32)
      step = None
    else:
      step = state_saver.state(state_name + '_step')
      c = state_saver.state(state_name + '_c')
      h = state_saver.state(state_name + '_h')
      init_state = (c, h)
    return lstm_cell, init_state, step
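The `state_saver` passed in here only needs to expose `state(name)`, returning the current tensor for a named state, and `save_state(name, value)`, returning an update op; this is the same pair that `extract_features` below relies on. As a rough sketch of that contract (a hypothetical helper, not part of this library; the tests earlier in this change use `tf.contrib.training.SequenceQueueingStateSaver` instead):

import tensorflow as tf

class DictStateSaver(object):
  """Hypothetical variable-backed saver exposing `state`/`save_state`."""

  def __init__(self, initial_states):
    # initial_states: dict mapping state name -> initial value (numpy/tensor).
    self._state_vars = {
        name: tf.Variable(value, trainable=False, name=name)
        for name, value in initial_states.items()
    }

  def state(self, name):
    # Read the current value of the named state as a tensor.
    return self._state_vars[name]

  def save_state(self, name, value):
    # Return an assign op that persists the new state; callers sequence it
    # with control dependencies, as extract_features does below.
    return tf.assign(self._state_vars[name], value)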
   def extract_features(self,
                        preprocessed_inputs,
                        state_saver=None,
...@@ -126,22 +158,12 @@ class LSTMSSDMobileNetV1FeatureExtractor(
     with slim.arg_scope(
         [slim.batch_norm], fused=False, is_training=self._is_training):
       # ConvLSTM layers.
+      batch_size = net.shape[0].value / unroll_length
       with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
-        lstm_cell = lstm_cells.BottleneckConvLSTMCell(
-            filter_size=(3, 3),
-            output_size=(net.shape[1].value, net.shape[2].value),
-            num_units=max(self._min_depth, self._lstm_state_depth),
-            activation=tf.nn.relu6,
-            visualize_gates=True)
+        lstm_cell, init_state, _ = self.create_lstm_cell(
+            batch_size, (net.shape[1].value, net.shape[2].value), state_saver,
+            state_name)
         net_seq = list(tf.split(net, unroll_length))
-        if state_saver is None:
-          init_state = lstm_cell.init_state(
-              state_name, net.shape[0].value / unroll_length, tf.float32)
-        else:
-          c = state_saver.state('%s_c' % state_name)
-          h = state_saver.state('%s_h' % state_name)
-          init_state = (c, h)
         # Identities added for inputting state tensors externally.
         c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
...@@ -157,7 +179,7 @@ class LSTMSSDMobileNetV1FeatureExtractor(
           batcher_ops = [
               state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
               state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
-              state_saver.save_state('%s_step' % state_name, self._step - 1)
+              state_saver.save_state('%s_step' % state_name, self._step + 1)
           ]
         with tf_ops.control_dependencies(batcher_ops):
           image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)
...
...@@ -42,11 +42,11 @@ class LstmSsdMobilenetV1FeatureExtractorTest(
       use_explicit_padding: A boolean whether to use explicit padding.

     Returns:
-      An lstm_ssd_meta_arch.LSTMMobileNetV1FeatureExtractor object.
+      An lstm_ssd_meta_arch.LSTMSSDMobileNetV1FeatureExtractor object.
     """
     min_depth = 32
     extractor = (
-        feature_extactor.LSTMMobileNetV1FeatureExtractor(
+        feature_extactor.LSTMSSDMobileNetV1FeatureExtractor(
             is_training,
             depth_multiplier,
             min_depth,
...
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Definitions for modified MobileNet models used in LSTD."""
import tensorflow as tf
from nets import mobilenet_v1
from nets.mobilenet import conv_blocks as mobilenet_convs
from nets.mobilenet import mobilenet
slim = tf.contrib.slim
def mobilenet_v1_lite_def(depth_multiplier, low_res=False):
"""Conv definitions for a lite MobileNet v1 model.
Args:
depth_multiplier: float depth multiplier for MobileNet.
    low_res: Whether to use a low-resolution conv input for the interleaved
      model.
Returns:
Array of convolutions.
Raises:
ValueError: On invalid channels with provided depth multiplier.
"""
conv = mobilenet_v1.Conv
sep_conv = mobilenet_v1.DepthSepConv
def _find_target_depth(original, depth_multiplier):
# Find the target depth such that:
# int(target * depth_multiplier) == original
pseudo_target = int(original / depth_multiplier)
for target in range(pseudo_target - 1, pseudo_target + 2):
if int(target * depth_multiplier) == original:
return target
raise ValueError('Cannot have %d channels with depth multiplier %0.2f' %
(original, depth_multiplier))
return [
conv(kernel=[3, 3], stride=2, depth=32),
sep_conv(kernel=[3, 3], stride=1, depth=64),
sep_conv(kernel=[3, 3], stride=2, depth=128),
sep_conv(kernel=[3, 3], stride=1, depth=128),
sep_conv(kernel=[3, 3], stride=2, depth=256),
sep_conv(kernel=[3, 3], stride=1, depth=256),
sep_conv(kernel=[3, 3], stride=2, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1, depth=512),
sep_conv(kernel=[3, 3], stride=1 if low_res else 2, depth=1024),
sep_conv(
kernel=[3, 3],
stride=1,
depth=int(_find_target_depth(1024, depth_multiplier)))
]
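For intuition, `_find_target_depth` inverts the channel rounding `int(target * depth_multiplier)`: with a multiplier of 0.5 and 1024 desired output channels, it probes around 2048 and returns 2048, since int(2048 * 0.5) == 1024. A standalone check mirroring the helper above (the free function name here is illustrative):

def find_target_depth(original, depth_multiplier):
  # Same probe-around-the-quotient search as _find_target_depth above.
  pseudo_target = int(original / depth_multiplier)
  for target in range(pseudo_target - 1, pseudo_target + 2):
    if int(target * depth_multiplier) == original:
      return target
  raise ValueError('Cannot have %d channels with depth multiplier %0.2f' %
                   (original, depth_multiplier))

assert find_target_depth(1024, 0.5) == 2048  # int(2048 * 0.5) == 1024
assert find_target_depth(1024, 2.0) == 512   # int(512 * 2.0) == 1024
assert find_target_depth(1024, 1.0) == 1024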
def mobilenet_v2_lite_def(reduced=False, is_quantized=False, low_res=False):
"""Conv definitions for a lite MobileNet v2 model.
Args:
    reduced: Determines the scaling factor for expanded conv. If True, a
      factor of 3 is used. If False, a factor of 6 is used.
is_quantized: Whether the model is trained in quantized mode.
low_res: Whether the input to the model is of half resolution.
Returns:
Array of convolutions.
"""
expanded_conv = mobilenet_convs.expanded_conv
expand_input = mobilenet_convs.expand_input_by_factor
op = mobilenet.op
return dict(
defaults={
# Note: these parameters of batch norm affect the architecture
# that's why they are here and not in training_scope.
(slim.batch_norm,): {
'center': True,
'scale': True
},
(slim.conv2d, slim.fully_connected, slim.separable_conv2d): {
'normalizer_fn': slim.batch_norm,
'activation_fn': tf.nn.relu6
},
(expanded_conv,): {
'expansion_size': expand_input(6),
'split_expansion': 1,
'normalizer_fn': slim.batch_norm,
'residual': True
},
(slim.conv2d, slim.separable_conv2d): {
'padding': 'SAME'
}
},
spec=[
op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]),
op(expanded_conv,
expansion_size=expand_input(1, divisible_by=1),
num_outputs=16),
op(expanded_conv,
expansion_size=(expand_input(3, divisible_by=1)
if reduced else expand_input(6)),
stride=2,
num_outputs=24),
op(expanded_conv,
expansion_size=(expand_input(3, divisible_by=1)
if reduced else expand_input(6)),
stride=1,
num_outputs=24),
op(expanded_conv, stride=2, num_outputs=32),
op(expanded_conv, stride=1, num_outputs=32),
op(expanded_conv, stride=1, num_outputs=32),
op(expanded_conv, stride=2, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=64),
op(expanded_conv, stride=1, num_outputs=96),
op(expanded_conv, stride=1, num_outputs=96),
op(expanded_conv, stride=1, num_outputs=96),
op(expanded_conv, stride=1 if low_res else 2, num_outputs=160),
op(expanded_conv, stride=1, num_outputs=160),
op(expanded_conv, stride=1, num_outputs=160),
op(expanded_conv,
stride=1,
num_outputs=320,
project_activation_fn=(tf.nn.relu6
if is_quantized else tf.identity))
],
)
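These definitions plug directly into `mobilenet_v2.mobilenet_base`, as the tests below exercise. A quick usage sketch for the quantized variant:

import tensorflow as tf
from lstm_object_detection.models import mobilenet_defs
from nets.mobilenet import mobilenet_v2

# Build the lite MobileNet v2 backbone on a 320x320 input.
inputs = tf.placeholder(tf.float32, (1, 320, 320, 3))
net, end_points = mobilenet_v2.mobilenet_base(
    inputs,
    min_depth=8,
    depth_multiplier=1.0,
    conv_defs=mobilenet_defs.mobilenet_v2_lite_def(is_quantized=True),
    use_explicit_padding=True,
    scope='MobilenetV2')
# Overall stride is 32, so net is [1, 10, 10, 320]; with is_quantized=True the
# final projection uses relu6 (the Relu6 op the tests below assert on).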
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lstm_object_detection.models.mobilenet_defs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from lstm_object_detection.models import mobilenet_defs
from nets import mobilenet_v1
from nets.mobilenet import mobilenet_v2
class MobilenetV1DefsTest(tf.test.TestCase):
def test_mobilenet_v1_lite_def(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(1.0),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 1024])
def test_mobilenet_v1_lite_def_depthmultiplier_half(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=0.5,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(0.5),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 1024])
def test_mobilenet_v1_lite_def_depthmultiplier_2x(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=2.0,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(2.0),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 1024])
def test_mobilenet_v1_lite_def_low_res(self):
net, _ = mobilenet_v1.mobilenet_v1_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
final_endpoint='Conv2d_13_pointwise',
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v1_lite_def(1.0, low_res=True),
use_explicit_padding=True,
scope='MobilenetV1')
self.assertEqual(net.get_shape().as_list(), [10, 20, 20, 1024])
class MobilenetV2DefsTest(tf.test.TestCase):
def test_mobilenet_v2_lite_def(self):
net, features = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
self._assert_contains_op('MobilenetV2/expanded_conv_16/project/Identity')
self.assertEqual(
features['layer_3/expansion_output'].get_shape().as_list(),
[10, 160, 160, 96])
self.assertEqual(
features['layer_4/expansion_output'].get_shape().as_list(),
[10, 80, 80, 144])
def test_mobilenet_v2_lite_def_is_quantized(self):
net, _ = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(is_quantized=True),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
self._assert_contains_op('MobilenetV2/expanded_conv_16/project/Relu6')
def test_mobilenet_v2_lite_def_low_res(self):
net, _ = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(low_res=True),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 20, 20, 320])
def test_mobilenet_v2_lite_def_reduced(self):
net, features = mobilenet_v2.mobilenet_base(
tf.placeholder(tf.float32, (10, 320, 320, 3)),
min_depth=8,
depth_multiplier=1.0,
conv_defs=mobilenet_defs.mobilenet_v2_lite_def(reduced=True),
use_explicit_padding=True,
scope='MobilenetV2')
self.assertEqual(net.get_shape().as_list(), [10, 10, 10, 320])
self.assertEqual(
features['layer_3/expansion_output'].get_shape().as_list(),
[10, 160, 160, 48])
self.assertEqual(
features['layer_4/expansion_output'].get_shape().as_list(),
[10, 80, 80, 72])
def _assert_contains_op(self, op_name):
op_names = [op.name for op in tf.get_default_graph().get_operations()]
self.assertIn(op_name, op_names)
if __name__ == '__main__':
tf.test.main()
syntax = "proto2"; syntax = "proto2";
package lstm_object_detection.input_readers; package lstm_object_detection.protos;
import "object_detection/protos/input_reader.proto"; import "object_detection/protos/input_reader.proto";
...@@ -20,9 +20,8 @@ message TFRecordVideoInputReader { ...@@ -20,9 +20,8 @@ message TFRecordVideoInputReader {
enum DataType { enum DataType {
UNSPECIFIED = 0; UNSPECIFIED = 0;
ANNOTATED_IMAGE = 1; TF_EXAMPLE = 1;
TF_EXAMPLE = 2; TF_SEQUENCE_EXAMPLE = 2;
TF_SEQUENCE_EXAMPLE = 3;
} }
optional DataType data_type = 2 [default=TF_SEQUENCE_EXAMPLE]; optional DataType data_type = 2 [default=TF_SEQUENCE_EXAMPLE];
......
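With the renumbered enum above, a video input reader selects its record format through `data_type`. A minimal text-proto fragment (all other `TFRecordVideoInputReader` fields are omitted here):

  data_type: TF_SEQUENCE_EXAMPLE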
syntax = "proto2"; syntax = "proto2";
package object_detection.protos; package lstm_object_detection.protos;
import "object_detection/protos/pipeline.proto"; import "object_detection/protos/pipeline.proto";
import "lstm_object_detection/protos/quant_overrides.proto";
extend TrainEvalPipelineConfig { extend object_detection.protos.TrainEvalPipelineConfig {
optional LstmModel lstm_model = 205743444; optional LstmModel lstm_model = 205743444;
optional QuantOverrides quant_overrides = 246059837;
} }
// Message for extra fields needed for configuring LSTM model. // Message for extra fields needed for configuring LSTM model.
...@@ -18,4 +20,50 @@ message LstmModel { ...@@ -18,4 +20,50 @@ message LstmModel {
// Depth of the lstm feature map. // Depth of the lstm feature map.
optional int32 lstm_state_depth = 3 [default = 256]; optional int32 lstm_state_depth = 3 [default = 256];
// Depth multipliers for multiple feature extractors. Used for interleaved
// or ensemble model.
repeated float depth_multipliers = 4;
// Specifies how models are interleaved when multiple feature extractors are
// used during training. Must be in ['RANDOM', 'RANDOM_SKIP_SMALL'].
optional string train_interleave_method = 5 [default = 'RANDOM'];
  // Specifies how models are interleaved when multiple feature extractors are
  // used during evaluation. Must be in ['RANDOM', 'RANDOM_SKIP', 'SKIPK'],
  // where the K in 'SKIPK' is an integer (e.g. the default 'SKIP9').
  optional string eval_interleave_method = 6 [default = 'SKIP9'];
// The stride of the lstm state.
optional int32 lstm_state_stride = 7 [default = 32];
  // Whether to flatten LSTM state and output. Note that this is typically
  // intended only to be modified internally by export_tfmini_lstd_graph_lib
  // to support flattened state for tfmini/tflite. Do not set this field in
// the pipeline config file unless necessary.
optional bool flatten_state = 8 [default = false];
// Whether to apply bottleneck layer before going into LSTM gates. This
// allows multiple feature extractors to use separate bottleneck layers
// instead of sharing the same one so that different base model output
// feature dimensions are not forced to be the same.
// For example:
// Model 1 outputs feature map f_1 of depth d_1.
// Model 2 outputs feature map f_2 of depth d_2.
// Pre-bottlenecking allows lstm input to be either:
// conv(concat([f_1, h])) or conv(concat([f_2, h])).
optional bool pre_bottleneck = 9 [default = false];
// Normalize LSTM state, default false.
optional bool scale_state = 10 [default = false];
// Clip LSTM state at [0, 6], default true.
optional bool clip_state = 11 [default = true];
// If the model is in quantized training. This field does NOT need to be set
// manually. Instead, it will be overridden by configs in graph_rewriter.
optional bool is_quantized = 12 [default = false];
// Downsample input image when using the smaller network in interleaved
// models, default false.
optional bool low_res = 13 [default = false];
}
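Since `lstm_model` is declared as an extension of `object_detection.protos.TrainEvalPipelineConfig`, a pipeline config sets these fields under the fully qualified extension name. A hedged sketch of such a fragment (values are illustrative, not recommended settings):

  # pipeline.config fragment (text proto):
  [lstm_object_detection.protos.lstm_model] {
    lstm_state_depth: 256
    depth_multipliers: 1.0
    depth_multipliers: 0.35
    train_interleave_method: "RANDOM_SKIP_SMALL"
    eval_interleave_method: "SKIP9"
    pre_bottleneck: true
    low_res: true
  }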
syntax = "proto2";
package lstm_object_detection.protos;
// Message to override default quantization behavior.
message QuantOverrides {
repeated QuantConfig quant_configs = 1;
}
// Parameters to manually create fake quant ops outside of the generic
// tensorflow/contrib/quantize/python/quantize.py script. This may be used to
// override default behavior or to quantize ops not already supported.
message QuantConfig {
// The name of the op to add a fake quant op to.
required string op_name = 1;
// The name of the fake quant op.
required string quant_op_name = 2;
// Whether the fake quant op uses fixed ranges. Otherwise, learned moving
// average ranges are used.
required bool fixed_range = 3 [default = false];
  // The initial minimum value of the range.
optional float min = 4 [default = -6];
// The initial maximum value of the range.
optional float max = 5 [default = 6];
// Number of steps to delay before quantization takes effect during training.
optional int32 delay = 6 [default = 500000];
// Number of bits to use for quantizing weights.
// Only 8 bit is supported for now.
optional int32 weight_bits = 7 [default = 8];
// Number of bits to use for quantizing activations.
// Only 8 bit is supported for now.
optional int32 activation_bits = 8 [default = 8];
}
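Paired with the `quant_overrides` extension registered in pipeline.proto above, a config can pin a fixed quantization range on a single op. A sketch with hypothetical op names (every name below is a placeholder, not an op from this library):

  # pipeline.config fragment (text proto):
  [lstm_object_detection.protos.quant_overrides] {
    quant_configs {
      op_name: "LSTM/bottleneck/Relu6"            # hypothetical target op
      quant_op_name: "LSTM/bottleneck/act_quant"  # hypothetical fake-quant name
      fixed_range: true  # use the fixed [min, max] below, not moving averages
      min: -6
      max: 6
      delay: 500000
      weight_bits: 8
      activation_bits: 8
    }
  }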
...@@ -21,7 +21,6 @@ DetectionModel.
 import functools

 import tensorflow as tf

-from google3.pyglib import logging
 from object_detection.builders import optimizer_builder
 from object_detection.core import standard_fields as fields
...@@ -200,7 +199,7 @@ def get_restore_checkpoint_ops(restore_checkpoints, detection_model,
             var_map, restore_checkpoint))
     for var_name, var in available_var_map.iteritems():
       if var in vars_restored:
-        logging.info('Variable %s contained in multiple checkpoints',
-                     var.op.name)
+        tf.logging.info('Variable %s contained in multiple checkpoints',
+                        var.op.name)
         del available_var_map[var_name]
       else:
...@@ -221,7 +220,7 @@ def get_restore_checkpoint_ops(restore_checkpoints, detection_model,
     if available_var_map.keys():
       restorers.append(init_saver)
     else:
-      logging.info('WARNING: Checkpoint %s has no restorable variables',
-                   restore_checkpoint)
+      tf.logging.info('WARNING: Checkpoint %s has no restorable variables',
+                      restore_checkpoint)
     return restorers
...