Unverified commit 58856e2b authored by Menglong Zhu, committed by GitHub

Merged commit includes the following changes: (#6726)

246873701  by menglong:

    Missing __init__.py under meta_architectures/

--
246857392  by menglong:

    Standardize proto namespace: lstm_object_detection.protos

--
246625127  by menglong:

    Internal changes.

--
246596481  by menglong:

    Add License

--
246580605  by menglong:

    Internal changes

--
246344626  by menglong:

    Open source interleaved mobilenet v2 model.

--
244893883  by menglong:

    Introduce multi_input_decoder for interleaved model.

--
244461016  by menglong:

    Add pre-bottleneck operation to lstm cells to support interleaved model.

--
244052176  by menglong:

    Update README

--
244020495  by menglong:

    Add test to rnn_decoder.

--
243704250  by menglong:

    Duplicate assignment.

--
243091836  by menglong:

    Move LSTMSSD meta arch into separate folder

--
242900337  by menglong:

    Modified mobilenet definition for LSTM-SSD

--
242773195  by menglong:

    Release GroupedConvLSTMCell implementation: https://arxiv.org/abs/1903.10172

--
242574736  by menglong:

    Introduce module for quantized training.

--
242544306  by menglong:

    lstm_ssd_meta_arch updates, added test
    rename:
    - LSTMMetaArch to LSTMSSDMetaArch
    - LSTMFeatureExtractor to LSTMSSDFeatureExtractor

--
241986236  by menglong:

    Move lstm quantization utils to 3rd party.

--
225922488  by yinxiao:

    Training pipeline fixes.

--
224839137  by yinxiao:

    Issue fix for lstm object detection sample config.

--
224246947  by menglong:

    Fix logging module import

--

PiperOrigin-RevId: 246873701
parent f5073f49
Tensorflow mobile video object detection implementation proposed in the following paper:
Mobile Video Object Detection with Temporally-Aware Feature Maps (CVPR 2018).
http://openaccess.thecvf.com/content_cvpr_2018/papers/Liu_Mobile_Video_Object_CVPR_2018_paper.pdf
@article{liu2017mobile,
title={Mobile Video Object Detection with Temporally-Aware Feature Maps},
author={Liu, Mason and Zhu, Menglong},
journal={CVPR},
year={2018}
}
If you have any questions regarding this codebase, please contact us:
masonliuw@gmail.com
yinxiao@google.com
menglong@google.com
# Tensorflow Mobile Video Object Detection
TensorFlow implementation of mobile video object detection, as proposed in the
following papers:
<p align="center">
<img src="g3doc/lstm_ssd_intro.png" width=640 height=360>
</p>
```
"Mobile Video Object Detection with Temporally-Aware Feature Maps",
Liu, Mason and Zhu, Menglong, CVPR 2018.
```
\[[link](http://openaccess.thecvf.com/content_cvpr_2018/papers/Liu_Mobile_Video_Object_CVPR_2018_paper.pdf)\]\[[bibtex](
https://scholar.googleusercontent.com/scholar.bib?q=info:hq5rcMUUXysJ:scholar.google.com/&output=citation&scisig=AAGBfm0AAAAAXLdwXcU5g_wiMQ40EvbHQ9kTyvfUxffh&scisf=4&ct=citation&cd=-1&hl=en)\]
<p align="center">
<img src="g3doc/Interleaved_Intro.png" width=480 height=360>
</p>
```
"Looking Fast and Slow: Memory-Guided Mobile Video Object Detection",
Liu, Mason and Zhu, Menglong and White, Marie and Li, Yinxiao and Kalenichenko, Dmitry
```
\[[link](https://arxiv.org/abs/1903.10172)\]\[[bibtex](
https://scholar.googleusercontent.com/scholar.bib?q=info:rLqvkztmWYgJ:scholar.google.com/&output=citation&scisig=AAGBfm0AAAAAXLdwNf-LJlm2M1ymQHbq2wYA995MHpJu&scisf=4&ct=citation&cd=-1&hl=en)\]
## Maintainers
* masonliuw@gmail.com
* yinxiao@google.com
* menglong@google.com
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Custom version for quantized training and evaluation functions.
The main difference between this and the third_party graph_rewriter_builder.py
is that this version uses experimental_create_training_graph which allows the
customization of freeze_bn_delay.
"""
import re
import tensorflow as tf
from tensorflow.contrib.quantize.python import common
from tensorflow.contrib.quantize.python import input_to_ops
from tensorflow.contrib.quantize.python import quant_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
def build(graph_rewriter_config,
quant_overrides_config=None,
is_training=True,
is_export=False):
"""Returns a function that modifies default graph based on options.
Args:
graph_rewriter_config: graph_rewriter_pb2.GraphRewriter proto.
quant_overrides_config: quant_overrides_pb2.QuantOverrides proto.
is_training: whether in training or eval mode.
is_export: whether exporting the graph.
"""
def graph_rewrite_fn():
"""Function to quantize weights and activation of the default graph."""
if (graph_rewriter_config.quantization.weight_bits != 8 or
graph_rewriter_config.quantization.activation_bits != 8):
raise ValueError('Only 8bit quantization is supported')
graph = tf.get_default_graph()
# Insert custom quant ops.
if quant_overrides_config is not None:
input_to_ops_map = input_to_ops.InputToOps(graph)
for q in quant_overrides_config.quant_configs:
producer = graph.get_operation_by_name(q.op_name)
if producer is None:
raise ValueError('Op name does not exist in graph.')
context = _get_context_from_op(producer)
consumers = input_to_ops_map.ConsumerOperations(producer)
if q.fixed_range:
_insert_fixed_quant_op(
context,
q.quant_op_name,
producer,
consumers,
init_min=q.min,
init_max=q.max,
quant_delay=q.delay if is_training else 0)
else:
raise ValueError('Learned ranges are not yet supported.')
# Quantize the graph by inserting quantize ops for weights and activations
if is_training:
tf.contrib.quantize.experimental_create_training_graph(
input_graph=graph,
quant_delay=graph_rewriter_config.quantization.delay,
freeze_bn_delay=graph_rewriter_config.quantization.delay)
else:
tf.contrib.quantize.experimental_create_eval_graph(
input_graph=graph,
quant_delay=graph_rewriter_config.quantization.delay
if not is_export else 0)
tf.contrib.layers.summarize_collection('quant_vars')
return graph_rewrite_fn
def _get_context_from_op(op):
"""Gets the root context name from the op name."""
context_re = re.search(r'^(.*)/([^/]+)', op.name)
if context_re:
return context_re.group(1)
return ''
def _insert_fixed_quant_op(context,
name,
producer,
consumers,
init_min=-6.0,
init_max=6.0,
quant_delay=None):
"""Adds a fake quant op with fixed ranges.
Args:
context: The parent scope of the op to be quantized.
name: The name of the fake quant op.
producer: The producer op to be quantized.
consumers: The consumer ops to the producer op.
init_min: The minimum range for the fake quant op.
init_max: The maximum range for the fake quant op.
quant_delay: Number of steps to wait before activating the fake quant op.
Raises:
ValueError: When producer operation is not directly connected to the
consumer operation.
"""
name_prefix = name if not context else context + '/' + name
inputs = producer.outputs[0]
quant = quant_ops.FixedQuantize(
inputs, init_min=init_min, init_max=init_max, scope=name_prefix)
if quant_delay and quant_delay > 0:
activate_quant = math_ops.greater_equal(
common.CreateOrGetQuantizationStep(),
quant_delay,
name=name_prefix + '/activate_quant')
quant = control_flow_ops.cond(
activate_quant,
lambda: quant,
lambda: inputs,
name=name_prefix + '/delayed_quant')
if consumers:
tensors_modified_count = common.RerouteTensor(
quant, inputs, can_modify=consumers)
# Some operations can have multiple output tensors going to the same
# consumer. Since consumers is a set, we need to ensure that
# tensors_modified_count is greater than or equal to the length of the set
# of consumers.
if tensors_modified_count < len(consumers):
raise ValueError('No inputs quantized for ops: [%s]' % ', '.join(
[consumer.name for consumer in consumers]))
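
For context, a minimal usage sketch of the builder above. The proto fields mirror those exercised in the test that follows; the delay value is an arbitrary placeholder.

```python
from lstm_object_detection.builders import graph_rewriter_builder
from object_detection.protos import graph_rewriter_pb2

# Configure 8-bit quantization (the only bit width supported above).
graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
graph_rewriter_proto.quantization.delay = 500000  # placeholder step count
graph_rewriter_proto.quantization.weight_bits = 8
graph_rewriter_proto.quantization.activation_bits = 8

# build() returns a closure; calling it rewrites the current default graph
# with fake-quant ops via experimental_create_training_graph.
graph_rewrite_fn = graph_rewriter_builder.build(
    graph_rewriter_proto, is_training=True)
graph_rewrite_fn()
```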
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for graph_rewriter_builder."""
import mock
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from lstm_object_detection.builders import graph_rewriter_builder
from lstm_object_detection.protos import quant_overrides_pb2
from object_detection.protos import graph_rewriter_pb2
class QuantizationBuilderTest(tf.test.TestCase):
def testQuantizationBuilderSetsUpCorrectTrainArguments(self):
with mock.patch.object(
tf.contrib.quantize,
'experimental_create_training_graph') as mock_quant_fn:
with mock.patch.object(tf.contrib.layers,
'summarize_collection') as mock_summarize_col:
graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
graph_rewriter_proto.quantization.delay = 10
graph_rewriter_proto.quantization.weight_bits = 8
graph_rewriter_proto.quantization.activation_bits = 8
graph_rewrite_fn = graph_rewriter_builder.build(
graph_rewriter_proto, is_training=True)
graph_rewrite_fn()
_, kwargs = mock_quant_fn.call_args
self.assertEqual(kwargs['input_graph'], tf.get_default_graph())
self.assertEqual(kwargs['quant_delay'], 10)
mock_summarize_col.assert_called_with('quant_vars')
def testQuantizationBuilderSetsUpCorrectEvalArguments(self):
with mock.patch.object(tf.contrib.quantize,
'experimental_create_eval_graph') as mock_quant_fn:
with mock.patch.object(tf.contrib.layers,
'summarize_collection') as mock_summarize_col:
graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
graph_rewriter_proto.quantization.delay = 10
graph_rewrite_fn = graph_rewriter_builder.build(
graph_rewriter_proto, is_training=False)
graph_rewrite_fn()
_, kwargs = mock_quant_fn.call_args
self.assertEqual(kwargs['input_graph'], tf.get_default_graph())
mock_summarize_col.assert_called_with('quant_vars')
def testQuantizationBuilderAddsQuantOverride(self):
graph = ops.Graph()
with graph.as_default():
self._buildGraph()
quant_overrides_proto = quant_overrides_pb2.QuantOverrides()
quant_config = quant_overrides_proto.quant_configs.add()
quant_config.op_name = 'test_graph/add_ab'
quant_config.quant_op_name = 'act_quant'
quant_config.fixed_range = True
quant_config.min = 0
quant_config.max = 6
quant_config.delay = 100
graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
graph_rewriter_proto.quantization.delay = 10
graph_rewriter_proto.quantization.weight_bits = 8
graph_rewriter_proto.quantization.activation_bits = 8
graph_rewrite_fn = graph_rewriter_builder.build(
graph_rewriter_proto,
quant_overrides_config=quant_overrides_proto,
is_training=True)
graph_rewrite_fn()
act_quant_found = False
quant_delay_found = False
for op in graph.get_operations():
if (quant_config.quant_op_name in op.name and
op.type == 'FakeQuantWithMinMaxArgs'):
act_quant_found = True
min_val = op.get_attr('min')
max_val = op.get_attr('max')
self.assertEqual(min_val, quant_config.min)
self.assertEqual(max_val, quant_config.max)
if ('activate_quant' in op.name and
quant_config.quant_op_name in op.name and op.type == 'Const'):
tensor = op.get_attr('value')
if tensor.int64_val[0] == quant_config.delay:
quant_delay_found = True
self.assertTrue(act_quant_found)
self.assertTrue(quant_delay_found)
def _buildGraph(self, scope='test_graph'):
with ops.name_scope(scope):
a = tf.constant(10, dtype=dtypes.float32, name='input_a')
b = tf.constant(20, dtype=dtypes.float32, name='input_b')
ab = tf.add(a, b, name='add_ab')
c = tf.constant(30, dtype=dtypes.float32, name='input_c')
abc = tf.multiply(ab, c, name='mul_ab_c')
return abc
if __name__ == '__main__':
tf.test.main()
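
The override test above builds its QuantOverrides proto field by field; the same configuration can also be expressed in protobuf text format, sketched below using the op name from the test's helper graph. Only fixed_range overrides are supported by the builder.

```python
from google.protobuf import text_format
from lstm_object_detection.protos import quant_overrides_pb2

quant_overrides_proto = text_format.Parse(
    """
    quant_configs {
      op_name: "test_graph/add_ab"   # op whose output gets a fixed-range fake quant
      quant_op_name: "act_quant"     # name of the inserted quant op
      fixed_range: true
      min: 0
      max: 6
      delay: 100                     # steps before the fake quant activates
    }
    """, quant_overrides_pb2.QuantOverrides())
```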
@@ -22,7 +22,7 @@
 model {
   ssd {
-    num_classes: 30
+    num_classes: 30 # Num of class for imagenet vid dataset.
     box_coder {
       faster_rcnn_box_coder {
         y_scale: 10.0
@@ -197,9 +197,9 @@ train_input_reader: {
   min_after_dequeue: 4
   label_map_path: "path/to/label_map"
   external_input_reader {
-    [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
+    [lstm_object_detection.protos.GoogleInputReader.google_input_reader] {
       tf_record_video_input_reader: {
-        input_path: "your/cns/path"
+        input_path: "path/to/sequence_example/data"
         data_type: TF_SEQUENCE_EXAMPLE
         video_length: 4
       }
@@ -208,7 +208,7 @@ train_input_reader: {
 }
 eval_config: {
-  metrics_set: "coco_evaluation_last_frame"
+  metrics_set: "coco_evaluation_all_frames"
   use_moving_averages: true
   min_score_threshold: 0.5
   max_num_boxes_to_visualize: 300
@@ -219,9 +219,9 @@ eval_config: {
 eval_input_reader: {
   label_map_path: "path/to/label_map"
   external_input_reader {
-    [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
+    [lstm_object_detection.protos.GoogleInputReader.google_input_reader] {
       tf_record_video_input_reader: {
-        input_path: "your/cns/path"
+        input_path: "path/to/sequence_example/data"
         data_type: TF_SEQUENCE_EXAMPLE
         video_length: 4
       }
...
@@ -20,7 +20,6 @@ DetectionModel.
 """
-import logging
 import tensorflow as tf
 from lstm_object_detection.metrics import coco_evaluation_all_frames
 from object_detection import eval_util
@@ -215,7 +214,7 @@ def evaluate(create_input_dict_fn,
   model = create_model_fn()
   if eval_config.ignore_groundtruth and not eval_config.export_path:
-    logging.fatal('If ignore_groundtruth=True then an export_path is '
-                  'required. Aborting!!!')
+    tf.logging.fatal('If ignore_groundtruth=True then an export_path is '
+                     'required. Aborting!!!')
   tensor_dicts = _extract_prediction_tensors(
@@ -252,14 +251,14 @@ def evaluate(create_input_dict_fn,
     third_party eval_util.py.
     """
     if batch_index % 10 == 0:
-      logging.info('Running eval ops batch %d', batch_index)
+      tf.logging.info('Running eval ops batch %d', batch_index)
     if not losses_dict:
       losses_dict = {}
     try:
       result_dicts, result_losses_dict = sess.run([tensor_dicts, losses_dict])
       counters['success'] += 1
     except tf.errors.InvalidArgumentError:
-      logging.info('Skipping image')
+      tf.logging.info('Skipping image')
       counters['skipped'] += 1
       return {}
     num_images = len(tensor_dicts)
...
@@ -23,7 +23,6 @@ Detection configuration framework, they should define their own builder function
 that wraps the build function.
 """
 import tensorflow as tf
-import tensorflow.google as google_tf
 from tensorflow.contrib.training.python.training import sequence_queueing_state_saver as sqss
 from lstm_object_detection.inputs import tf_sequence_example_decoder
 from lstm_object_detection.protos import input_reader_google_pb2
@@ -116,12 +115,12 @@ def build(input_reader_config,
                      'input_reader_pb2.InputReader.')
   external_reader_config = input_reader_config.external_input_reader
-  google_input_reader_config = external_reader_config.Extensions[
+  external_input_reader_config = external_reader_config.Extensions[
       input_reader_google_pb2.GoogleInputReader.google_input_reader]
-  input_reader_type = google_input_reader_config.WhichOneof('input_reader')
+  input_reader_type = external_input_reader_config.WhichOneof('input_reader')
   if input_reader_type == 'tf_record_video_input_reader':
-    config = google_input_reader_config.tf_record_video_input_reader
+    config = external_input_reader_config.tf_record_video_input_reader
     reader_type_class = tf.TFRecordReader
   else:
     raise ValueError(
...
@@ -20,7 +20,6 @@ import numpy as np
 import tensorflow as tf
 from google.protobuf import text_format
-from google3.testing.pybase import parameterized
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from lstm_object_detection.inputs import seq_dataset_builder
@@ -32,7 +31,7 @@ from object_detection.protos import pipeline_pb2
 from object_detection.protos import preprocessor_pb2
-class DatasetBuilderTest(parameterized.TestCase):
+class DatasetBuilderTest(tf.test.TestCase):
   def _create_tf_record(self):
     path = os.path.join(self.get_temp_dir(), 'tfrecord')
@@ -104,7 +103,7 @@ class DatasetBuilderTest(parameterized.TestCase):
     """
     model_text_proto = """
-    [object_detection.protos.lstm_model] {
+    [lstm_object_detection.protos.lstm_model] {
       train_unroll_length: 4
       eval_unroll_length: 4
     }
@@ -211,7 +210,7 @@ class DatasetBuilderTest(parameterized.TestCase):
   def _get_input_proto(self, input_reader):
     return """
       external_input_reader {
-        [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
+        [lstm_object_detection.protos.GoogleInputReader.google_input_reader] {
          %s: {
            input_path: '{0}'
            data_type: TF_SEQUENCE_EXAMPLE
@@ -221,11 +220,11 @@ class DatasetBuilderTest(tf.test.TestCase):
        }
    """ % input_reader
-  @parameterized.named_parameters(('tf_record', 'tf_record_video_input_reader'))
-  def test_video_input_reader(self, video_input_type):
+  def test_video_input_reader(self):
     input_reader_proto = input_reader_pb2.InputReader()
     text_format.Merge(
-        self._get_input_proto(video_input_type), input_reader_proto)
+        self._get_input_proto('tf_record_video_input_reader'),
+        input_reader_proto)
     configs = self._get_model_configs_from_proto()
     tensor_dict = seq_dataset_builder.build(
...
@@ -17,8 +17,6 @@
 A decoder to decode string tensors containing serialized
 tensorflow.SequenceExample protos.
-TODO(yinxiao): When TensorFlow object detection API officially supports
-tensorflow.SequenceExample, merge this decoder.
 """
 import tensorflow as tf
 from object_detection.core import data_decoder
...
@@ -62,7 +62,7 @@ class BottleneckConvLstmCellsTest(tf.test.TestCase):
         filter_size=filter_size,
         output_size=output_size,
         num_units=num_units,
-        flattened_state=True)
+        flatten_state=True)
     init_state = cell.init_state(
         state_name, batch_size, dtype, learned_state)
     output, state_tuple = cell(inputs, init_state)
@@ -138,6 +138,275 @@ class BottleneckConvLstmCellsTest(tf.test.TestCase):
     self.assertAllEqual([4, 10, 10, 15], init_c.shape.as_list())
     self.assertAllEqual([4, 10, 10, 15], init_h.shape.as_list())
def test_unroll(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 15
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
unroll = 10
learned_state = False
inputs = tf.zeros([4, 10, 10, 3], dtype=tf.float32)
cell = lstm_cells.BottleneckConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units)
state = cell.init_state(
state_name, batch_size, dtype, learned_state)
for step in range(unroll):
output, state = cell(inputs, state)
self.assertAllEqual([4, 10, 10, 15], output.shape.as_list())
self.assertAllEqual([4, 10, 10, 15], state[0].shape.as_list())
self.assertAllEqual([4, 10, 10, 15], state[1].shape.as_list())
def test_prebottleneck(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 15
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
unroll = 10
learned_state = False
inputs_large = tf.zeros([4, 10, 10, 5], dtype=tf.float32)
inputs_small = tf.zeros([4, 10, 10, 3], dtype=tf.float32)
cell = lstm_cells.BottleneckConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
pre_bottleneck=True)
state = cell.init_state(
state_name, batch_size, dtype, learned_state)
for step in range(unroll):
if step % 2 == 0:
inputs = cell.pre_bottleneck(inputs_large, state[1], 0)
else:
inputs = cell.pre_bottleneck(inputs_small, state[1], 1)
output, state = cell(inputs, state)
self.assertAllEqual([4, 10, 10, 15], output.shape.as_list())
self.assertAllEqual([4, 10, 10, 15], state[0].shape.as_list())
self.assertAllEqual([4, 10, 10, 15], state[1].shape.as_list())
def test_flatten_state(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 15
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
unroll = 10
learned_state = False
inputs_large = tf.zeros([4, 10, 10, 5], dtype=tf.float32)
inputs_small = tf.zeros([4, 10, 10, 3], dtype=tf.float32)
cell = lstm_cells.BottleneckConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
pre_bottleneck=True,
flatten_state=True)
state = cell.init_state(
state_name, batch_size, dtype, learned_state)
for step in range(unroll):
if step % 2 == 0:
inputs = cell.pre_bottleneck(inputs_large, state[1], 0)
else:
inputs = cell.pre_bottleneck(inputs_small, state[1], 1)
output, state = cell(inputs, state)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
output_result, state_result = sess.run([output, state])
self.assertAllEqual((4, 10, 10, 15), output_result.shape)
self.assertAllEqual((4, 10*10*15), state_result[0].shape)
self.assertAllEqual((4, 10*10*15), state_result[1].shape)
class GroupedConvLstmCellsTest(tf.test.TestCase):
def test_run_lstm_cell(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 16
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
learned_state = False
inputs = tf.zeros([4, 10, 10, 3], dtype=tf.float32)
cell = lstm_cells.GroupedConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
is_training=True)
init_state = cell.init_state(
state_name, batch_size, dtype, learned_state)
output, state_tuple = cell(inputs, init_state)
self.assertAllEqual([4, 10, 10, 16], output.shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state_tuple[0].shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state_tuple[1].shape.as_list())
def test_run_lstm_cell_with_output_bottleneck(self):
filter_size = [3, 3]
output_dim = 10
output_size = [output_dim] * 2
num_units = 16
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
learned_state = False
inputs = tf.zeros([batch_size, output_dim, output_dim, 3], dtype=tf.float32)
cell = lstm_cells.GroupedConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
is_training=True,
output_bottleneck=True)
init_state = cell.init_state(
state_name, batch_size, dtype, learned_state)
output, state_tuple = cell(inputs, init_state)
self.assertAllEqual([4, 10, 10, 32], output.shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state_tuple[0].shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state_tuple[1].shape.as_list())
def test_get_init_state(self):
filter_size = [3, 3]
output_dim = 10
output_size = [output_dim] * 2
num_units = 16
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
learned_state = False
cell = lstm_cells.GroupedConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
is_training=True)
init_c, init_h = cell.init_state(
state_name, batch_size, dtype, learned_state)
self.assertEqual(tf.float32, init_c.dtype)
self.assertEqual(tf.float32, init_h.dtype)
with self.test_session() as sess:
init_c_res, init_h_res = sess.run([init_c, init_h])
self.assertAllClose(np.zeros((4, 10, 10, 16)), init_c_res)
self.assertAllClose(np.zeros((4, 10, 10, 16)), init_h_res)
def test_get_init_learned_state(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 16
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
learned_state = True
cell = lstm_cells.GroupedConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
is_training=True)
init_c, init_h = cell.init_state(
state_name, batch_size, dtype, learned_state)
self.assertEqual(tf.float32, init_c.dtype)
self.assertEqual(tf.float32, init_h.dtype)
self.assertAllEqual([4, 10, 10, 16], init_c.shape.as_list())
self.assertAllEqual([4, 10, 10, 16], init_h.shape.as_list())
def test_unroll(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 16
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
unroll = 10
learned_state = False
inputs = tf.zeros([4, 10, 10, 3], dtype=tf.float32)
cell = lstm_cells.GroupedConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
is_training=True)
state = cell.init_state(
state_name, batch_size, dtype, learned_state)
for step in range(unroll):
output, state = cell(inputs, state)
self.assertAllEqual([4, 10, 10, 16], output.shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state[0].shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state[1].shape.as_list())
def test_prebottleneck(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 16
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
unroll = 10
learned_state = False
inputs_large = tf.zeros([4, 10, 10, 5], dtype=tf.float32)
inputs_small = tf.zeros([4, 10, 10, 3], dtype=tf.float32)
cell = lstm_cells.GroupedConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
is_training=True,
pre_bottleneck=True)
state = cell.init_state(
state_name, batch_size, dtype, learned_state)
for step in range(unroll):
if step % 2 == 0:
inputs = cell.pre_bottleneck(inputs_large, state[1], 0)
else:
inputs = cell.pre_bottleneck(inputs_small, state[1], 1)
output, state = cell(inputs, state)
self.assertAllEqual([4, 10, 10, 16], output.shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state[0].shape.as_list())
self.assertAllEqual([4, 10, 10, 16], state[1].shape.as_list())
def test_flatten_state(self):
filter_size = [3, 3]
output_size = [10, 10]
num_units = 16
state_name = 'lstm_state'
batch_size = 4
dtype = tf.float32
unroll = 10
learned_state = False
inputs_large = tf.zeros([4, 10, 10, 5], dtype=tf.float32)
inputs_small = tf.zeros([4, 10, 10, 3], dtype=tf.float32)
cell = lstm_cells.GroupedConvLSTMCell(
filter_size=filter_size,
output_size=output_size,
num_units=num_units,
is_training=True,
pre_bottleneck=True,
flatten_state=True)
state = cell.init_state(
state_name, batch_size, dtype, learned_state)
for step in range(unroll):
if step % 2 == 0:
inputs = cell.pre_bottleneck(inputs_large, state[1], 0)
else:
inputs = cell.pre_bottleneck(inputs_small, state[1], 1)
output, state = cell(inputs, state)
with self.test_session() as sess:
sess.run(tf.global_variables_initializer())
output_result, state_result = sess.run([output, state])
self.assertAllEqual((4, 10, 10, 16), output_result.shape)
self.assertAllEqual((4, 10*10*16), state_result[0].shape)
self.assertAllEqual((4, 10*10*16), state_result[1].shape)
 if __name__ == '__main__':
   tf.test.main()
@@ -15,7 +15,7 @@
 """Custom RNN decoder."""
-from tensorflow.python.ops import variable_scope
+import tensorflow as tf
 def rnn_decoder(decoder_inputs,
@@ -23,7 +23,7 @@ def rnn_decoder(decoder_inputs,
                 cell,
                 loop_function=None,
                 scope=None):
-  """RNN decoder for the sequence-to-sequence model.
+  """RNN decoder for the LSTM-SSD model.
   This decoder returns a list of all states, rather than only the final state.
   Args:
@@ -43,24 +43,205 @@ def rnn_decoder(decoder_inputs,
     A tuple of the form (outputs, state), where:
       outputs: A list of the same length as decoder_inputs of 4D Tensors with
         shape [batch_size x output_size] containing generated outputs.
-      state: A list of the same length as decoder_inputs of the state of each
+      states: A list of the same length as decoder_inputs of the state of each
         cell at each time-step. It is a 2D Tensor of shape
         [batch_size x cell.state_size].
   """
-  with variable_scope.variable_scope(scope or 'rnn_decoder'):
-    state = initial_state
+  with tf.variable_scope(scope or 'rnn_decoder'):
+    state_tuple = initial_state
     outputs = []
     states = []
     prev = None
-    for i, decoder_input in enumerate(decoder_inputs):
+    for local_step, decoder_input in enumerate(decoder_inputs):
       if loop_function is not None and prev is not None:
-        with variable_scope.variable_scope('loop_function', reuse=True):
-          decoder_input = loop_function(prev, i)
-      if i > 0:
-        variable_scope.get_variable_scope().reuse_variables()
-      output, state = cell(decoder_input, state)
+        with tf.variable_scope('loop_function', reuse=True):
+          decoder_input = loop_function(prev, local_step)
+      output, state_tuple = cell(decoder_input, state_tuple)
       outputs.append(output)
-      states.append(state)
+      states.append(state_tuple)
       if loop_function is not None:
         prev = output
     return outputs, states
def multi_input_rnn_decoder(decoder_inputs,
initial_state,
cell,
sequence_step,
selection_strategy='RANDOM',
is_training=None,
is_quantized=False,
preprocess_fn_list=None,
pre_bottleneck=False,
flatten_state=False,
scope=None):
"""RNN decoder for the Interleaved LSTM-SSD model.
This decoder takes multiple sequences of inputs and selects the input to feed
to the rnn at each timestep using its selection_strategy, which can be random,
learned, or deterministic.
This decoder returns a list of all states, rather than only the final state.
Args:
decoder_inputs: A list of lists of 2D Tensors [batch_size x input_size].
initial_state: 2D Tensor with shape [batch_size x cell.state_size].
cell: rnn_cell.RNNCell defining the cell function and size.
sequence_step: Tensor [batch_size] of the step number of the first elements
in the sequence.
selection_strategy: Method for picking the decoder_input to use at each
timestep. Must be 'RANDOM', 'SKIPX' for integer X, where X is the number
of times to use the second input before using the first.
is_training: boolean, whether the network is training. When using learned
selection, attempts exploration if training.
is_quantized: flag to enable/disable quantization mode.
preprocess_fn_list: List of functions accepting two tensor arguments: one
timestep of decoder_inputs and the lstm state. If not None,
decoder_inputs[i] will be updated with preprocess_fn[i] at the start of
each timestep.
pre_bottleneck: if True, use separate bottleneck weights for each sequence.
Useful when input sequences have differing numbers of channels. Final
bottlenecks will have the same dimension.
flatten_state: Whether the LSTM state is flattened.
scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
Returns:
A tuple of the form (outputs, state), where:
outputs: A list of the same length as decoder_inputs of 2D Tensors with
shape [batch_size x output_size] containing generated outputs.
states: A list of the same length as decoder_inputs of the state of each
cell at each time-step. It is a 2D Tensor of shape
[batch_size x cell.state_size].
Raises:
ValueError: If selection_strategy is not recognized or unexpected unroll
length.
"""
if flatten_state and len(decoder_inputs[0]) > 1:
raise ValueError('In export mode, unroll length should not be more than 1')
with tf.variable_scope(scope or 'rnn_decoder'):
state_tuple = initial_state
outputs = []
states = []
batch_size = decoder_inputs[0][0].shape[0].value
num_sequences = len(decoder_inputs)
sequence_length = len(decoder_inputs[0])
for local_step in range(sequence_length):
for sequence_index in range(num_sequences):
if preprocess_fn_list is not None:
decoder_inputs[sequence_index][local_step] = (
preprocess_fn_list[sequence_index](
decoder_inputs[sequence_index][local_step], state_tuple[0]))
if pre_bottleneck:
decoder_inputs[sequence_index][local_step] = cell.pre_bottleneck(
inputs=decoder_inputs[sequence_index][local_step],
state=state_tuple[1],
input_index=sequence_index)
action = generate_action(selection_strategy, local_step, sequence_step,
[batch_size, 1, 1, 1])
inputs, _ = select_inputs(decoder_inputs, action, local_step)
# Mark base network endpoints under raw_inputs/
with tf.name_scope(None):
inputs = tf.identity(inputs, 'raw_inputs/base_endpoint')
output, state_tuple_out = cell(inputs, state_tuple)
state_tuple = select_state(state_tuple, state_tuple_out, action)
outputs.append(output)
states.append(state_tuple)
return outputs, states
def generate_action(selection_strategy, local_step, sequence_step,
action_shape):
"""Generate current (binary) action based on selection strategy.
Args:
selection_strategy: Method for picking the decoder_input to use at each
timestep. Must be 'RANDOM', 'SKIPX' for integer X, where X is the number
of times to use the second input before using the first.
local_step: Tensor [batch_size] of the step number within the current
unrolled batch.
sequence_step: Tensor [batch_size] of the step number of the first elements
in the sequence.
action_shape: The shape of action tensor to be generated.
Returns:
A tensor of shape action_shape, each element is an individual action.
Raises:
ValueError: if selection_strategy is not supported or if 'SKIP' is not
followed by numerics.
"""
if selection_strategy.startswith('RANDOM'):
action = tf.random.uniform(action_shape, maxval=2, dtype=tf.int32)
action = tf.minimum(action, 1)
# First step always runs large network.
if local_step == 0 and sequence_step is not None:
action *= tf.minimum(
tf.reshape(tf.cast(sequence_step, tf.int32), action_shape), 1)
elif selection_strategy.startswith('SKIP'):
inter_count = int(selection_strategy[4:])
if local_step % (inter_count + 1) == 0:
action = tf.zeros(action_shape)
else:
action = tf.ones(action_shape)
else:
raise ValueError('Selection strategy %s not recognized' %
selection_strategy)
return tf.cast(action, tf.int32)
def select_inputs(decoder_inputs, action, local_step, get_alt_inputs=False):
"""Selects sequence from decoder_inputs based on 1D actions.
Given multiple input batches, creates a single output batch by
selecting from the action[i]-ith input for the i-th batch element.
Args:
decoder_inputs: A 2-D list of tensor inputs.
action: A tensor of shape [batch_size]. Each element corresponds to an index
of decoder_inputs to choose.
local_step: The current timestep.
get_alt_inputs: Whether the non-chosen inputs should also be returned.
Returns:
The constructed output. Also outputs the elements that were not chosen
if get_alt_inputs is True, otherwise None.
Raises:
ValueError: if the decoder inputs contains other than two sequences.
"""
num_seqs = len(decoder_inputs)
if not num_seqs == 2:
raise ValueError('Currently only supports two sets of inputs.')
stacked_inputs = tf.stack(
[decoder_inputs[seq_index][local_step] for seq_index in range(num_seqs)],
axis=-1)
action_index = tf.one_hot(action, num_seqs)
inputs = tf.reduce_sum(stacked_inputs * action_index, axis=-1)
inputs_alt = None
# Only works for 2 models.
if get_alt_inputs:
# Reverse of action_index.
action_index_alt = tf.one_hot(action, num_seqs, on_value=0.0, off_value=1.0)
inputs_alt = tf.reduce_sum(stacked_inputs * action_index_alt, axis=-1)
return inputs, inputs_alt
def select_state(previous_state, new_state, action):
"""Select state given action.
Currently only a binary action is supported. If the action is 0, the state was
generated by the large model and we update the state. If the action is 1, the
state was generated by the small model and, in the interleaved model, we skip
this state update.
Args:
previous_state: A state tuple representing state from previous step.
new_state: A state tuple representing newly computed state.
action: A tensor the same shape as state.
Returns:
A state tuple selected based on the given action.
"""
action = tf.cast(action, tf.float32)
state_c = previous_state[0] * action + new_state[0] * (1 - action)
state_h = previous_state[1] * action + new_state[1] * (1 - action)
return (state_c, state_h)
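
A minimal sketch of driving the interleaved decoder with a fixed skip policy, mirroring the tests further below. `large_features`, `small_features`, `lstm_cell`, and `batch_size` are assumed to be defined elsewhere (e.g. lists of per-timestep feature maps and a BottleneckConvLSTMCell built with pre_bottleneck=True).

```python
import tensorflow as tf
from lstm_object_detection.lstm import rnn_decoder

# Two input sequences of equal unroll length; the cell must expose
# pre_bottleneck() so differing channel counts are projected to a common size.
outputs, states = rnn_decoder.multi_input_rnn_decoder(
    decoder_inputs=[large_features, small_features],
    initial_state=lstm_cell.init_state(
        'lstm_state', batch_size, tf.float32, False),
    cell=lstm_cell,
    sequence_step=tf.zeros([batch_size]),
    pre_bottleneck=True,
    selection_strategy='SKIP2')  # large model on steps 0, 3, 6, ...; small otherwise
```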
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lstm_object_detection.lstm.rnn_decoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
from lstm_object_detection.lstm import rnn_decoder
class MockRnnCell(tf.contrib.rnn.RNNCell):
def __init__(self, input_size, num_units):
self._input_size = input_size
self._num_units = num_units
self._filter_size = [3, 3]
def __call__(self, inputs, state_tuple):
outputs = tf.concat([inputs, state_tuple[0]], axis=3)
new_state_tuple = (tf.multiply(state_tuple[0], 2), state_tuple[1])
return outputs, new_state_tuple
def state_size(self):
return self._num_units
def output_size(self):
return self._input_size + self._num_units
def pre_bottleneck(self, inputs, state, input_index):
with tf.variable_scope('bottleneck_%d' % input_index, reuse=tf.AUTO_REUSE):
inputs = tf.contrib.layers.separable_conv2d(
tf.concat([inputs, state], 3),
self._input_size,
self._filter_size,
depth_multiplier=1,
activation_fn=tf.nn.relu6,
normalizer_fn=None)
return inputs
class RnnDecoderTest(tf.test.TestCase):
def test_rnn_decoder_single_unroll(self):
batch_size = 2
num_unroll = 1
num_units = 64
width = 8
height = 10
input_channels = 128
initial_state = tf.random_normal((batch_size, width, height, num_units))
inputs = tf.random_normal([batch_size, width, height, input_channels])
rnn_cell = MockRnnCell(input_channels, num_units)
outputs, states = rnn_decoder.rnn_decoder(
decoder_inputs=[inputs] * num_unroll,
initial_state=(initial_state, initial_state),
cell=rnn_cell)
self.assertEqual(len(outputs), num_unroll)
self.assertEqual(len(states), num_unroll)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
results = sess.run((outputs, states, inputs, initial_state))
outputs_results = results[0]
states_results = results[1]
inputs_results = results[2]
initial_states_results = results[3]
self.assertEqual(outputs_results[0].shape,
(batch_size, width, height, input_channels + num_units))
self.assertAllEqual(
outputs_results[0],
np.concatenate((inputs_results, initial_states_results), axis=3))
self.assertEqual(states_results[0][0].shape,
(batch_size, width, height, num_units))
self.assertEqual(states_results[0][1].shape,
(batch_size, width, height, num_units))
self.assertAllEqual(states_results[0][0],
np.multiply(initial_states_results, 2.0))
self.assertAllEqual(states_results[0][1], initial_states_results)
def test_rnn_decoder_multiple_unroll(self):
batch_size = 2
num_unroll = 3
num_units = 64
width = 8
height = 10
input_channels = 128
initial_state = tf.random_normal((batch_size, width, height, num_units))
inputs = tf.random_normal([batch_size, width, height, input_channels])
rnn_cell = MockRnnCell(input_channels, num_units)
outputs, states = rnn_decoder.rnn_decoder(
decoder_inputs=[inputs] * num_unroll,
initial_state=(initial_state, initial_state),
cell=rnn_cell)
self.assertEqual(len(outputs), num_unroll)
self.assertEqual(len(states), num_unroll)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
results = sess.run((outputs, states, inputs, initial_state))
outputs_results = results[0]
states_results = results[1]
inputs_results = results[2]
initial_states_results = results[3]
for i in range(num_unroll):
previous_state = ([initial_states_results, initial_states_results]
if i == 0 else states_results[i - 1])
self.assertEqual(
outputs_results[i].shape,
(batch_size, width, height, input_channels + num_units))
self.assertAllEqual(
outputs_results[i],
np.concatenate((inputs_results, previous_state[0]), axis=3))
self.assertEqual(states_results[i][0].shape,
(batch_size, width, height, num_units))
self.assertEqual(states_results[i][1].shape,
(batch_size, width, height, num_units))
self.assertAllEqual(states_results[i][0],
np.multiply(previous_state[0], 2.0))
self.assertAllEqual(states_results[i][1], previous_state[1])
class MultiInputRnnDecoderTest(tf.test.TestCase):
def test_rnn_decoder_single_unroll(self):
batch_size = 2
num_unroll = 1
num_units = 12
width = 8
height = 10
input_channels_large = 24
input_channels_small = 12
bottleneck_channels = 20
initial_state_c = tf.random_normal((batch_size, width, height, num_units))
initial_state_h = tf.random_normal((batch_size, width, height, num_units))
initial_state = (initial_state_c, initial_state_h)
inputs_large = tf.random_normal(
[batch_size, width, height, input_channels_large])
inputs_small = tf.random_normal(
[batch_size, width, height, input_channels_small])
rnn_cell = MockRnnCell(bottleneck_channels, num_units)
outputs, states = rnn_decoder.multi_input_rnn_decoder(
decoder_inputs=[[inputs_large] * num_unroll,
[inputs_small] * num_unroll],
initial_state=initial_state,
cell=rnn_cell,
sequence_step=tf.zeros([batch_size]),
pre_bottleneck=True)
self.assertEqual(len(outputs), num_unroll)
self.assertEqual(len(states), num_unroll)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
results = sess.run(
(outputs, states, inputs_large, inputs_small, initial_state))
outputs_results = results[0]
states_results = results[1]
inputs_large_results = results[2]
inputs_small_results = results[3]
initial_states_results = results[4]
self.assertEqual(
outputs_results[0].shape,
(batch_size, width, height, bottleneck_channels + num_units))
self.assertEqual(states_results[0][0].shape,
(batch_size, width, height, num_units))
self.assertEqual(states_results[0][1].shape,
(batch_size, width, height, num_units))
# The first step should always update state.
self.assertAllEqual(states_results[0][0],
np.multiply(initial_states_results[0], 2))
self.assertAllEqual(states_results[0][1], initial_states_results[1])
def test_rnn_decoder_multiple_unroll(self):
batch_size = 2
num_unroll = 3
num_units = 12
width = 8
height = 10
input_channels_large = 24
input_channels_small = 12
bottleneck_channels = 20
initial_state_c = tf.random_normal((batch_size, width, height, num_units))
initial_state_h = tf.random_normal((batch_size, width, height, num_units))
initial_state = (initial_state_c, initial_state_h)
inputs_large = tf.random_normal(
[batch_size, width, height, input_channels_large])
inputs_small = tf.random_normal(
[batch_size, width, height, input_channels_small])
rnn_cell = MockRnnCell(bottleneck_channels, num_units)
outputs, states = rnn_decoder.multi_input_rnn_decoder(
decoder_inputs=[[inputs_large] * num_unroll,
[inputs_small] * num_unroll],
initial_state=initial_state,
cell=rnn_cell,
sequence_step=tf.zeros([batch_size]),
pre_bottleneck=True)
self.assertEqual(len(outputs), num_unroll)
self.assertEqual(len(states), num_unroll)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
results = sess.run(
(outputs, states, inputs_large, inputs_small, initial_state))
outputs_results = results[0]
states_results = results[1]
inputs_large_results = results[2]
inputs_small_results = results[3]
initial_states_results = results[4]
# The first step should always update state.
self.assertAllEqual(states_results[0][0],
np.multiply(initial_states_results[0], 2))
self.assertAllEqual(states_results[0][1], initial_states_results[1])
for i in range(num_unroll):
self.assertEqual(
outputs_results[i].shape,
(batch_size, width, height, bottleneck_channels + num_units))
self.assertEqual(states_results[i][0].shape,
(batch_size, width, height, num_units))
self.assertEqual(states_results[i][1].shape,
(batch_size, width, height, num_units))
def test_rnn_decoder_multiple_unroll_with_skip(self):
batch_size = 2
num_unroll = 5
num_units = 12
width = 8
height = 10
input_channels_large = 24
input_channels_small = 12
bottleneck_channels = 20
skip = 2
initial_state_c = tf.random_normal((batch_size, width, height, num_units))
initial_state_h = tf.random_normal((batch_size, width, height, num_units))
initial_state = (initial_state_c, initial_state_h)
inputs_large = tf.random_normal(
[batch_size, width, height, input_channels_large])
inputs_small = tf.random_normal(
[batch_size, width, height, input_channels_small])
rnn_cell = MockRnnCell(bottleneck_channels, num_units)
outputs, states = rnn_decoder.multi_input_rnn_decoder(
decoder_inputs=[[inputs_large] * num_unroll,
[inputs_small] * num_unroll],
initial_state=initial_state,
cell=rnn_cell,
sequence_step=tf.zeros([batch_size]),
pre_bottleneck=True,
selection_strategy='SKIP%d' % skip)
self.assertEqual(len(outputs), num_unroll)
self.assertEqual(len(states), num_unroll)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
results = sess.run(
(outputs, states, inputs_large, inputs_small, initial_state))
outputs_results = results[0]
states_results = results[1]
inputs_large_results = results[2]
inputs_small_results = results[3]
initial_states_results = results[4]
for i in range(num_unroll):
self.assertEqual(
outputs_results[i].shape,
(batch_size, width, height, bottleneck_channels + num_units))
self.assertEqual(states_results[i][0].shape,
(batch_size, width, height, num_units))
self.assertEqual(states_results[i][1].shape,
(batch_size, width, height, num_units))
previous_state = (
initial_states_results if i == 0 else states_results[i - 1])
# State only updates during key frames
if i % (skip + 1) == 0:
self.assertAllEqual(states_results[i][0],
np.multiply(previous_state[0], 2))
self.assertAllEqual(states_results[i][1], previous_state[1])
else:
self.assertAllEqual(states_results[i][0], previous_state[0])
self.assertAllEqual(states_results[i][1], previous_state[1])
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Quantization related ops for LSTM."""
from __future__ import absolute_import
from __future__ import division
import tensorflow as tf
from tensorflow.python.training import moving_averages
def _quant_var(
name,
initializer_val,
vars_collection=tf.GraphKeys.MOVING_AVERAGE_VARIABLES,
):
"""Create an var for storing the min/max quantization range."""
return tf.contrib.framework.model_variable(
name,
shape=[],
initializer=tf.constant_initializer(initializer_val),
collections=[vars_collection],
trainable=False)
def quantizable_concat(inputs,
axis,
is_training,
is_quantized=True,
default_min=0,
default_max=6,
ema_decay=0.999,
scope='quantized_concat'):
"""Concat replacement with quantization option.
Allows concat inputs to share the same min max ranges,
from experimental/gazelle/synthetic/model/tpu/utils.py.
Args:
inputs: list of tensors to concatenate.
axis: dimension along which to concatenate.
is_training: true if the graph is a training graph.
is_quantized: flag to enable/disable quantization.
default_min: default min value for fake quant op.
default_max: default max value for fake quant op.
ema_decay: the moving average decay for the quantization variables.
scope: Optional scope for variable_scope.
Returns:
Tensor resulting from concatenation of input tensors
"""
if is_quantized:
with tf.variable_scope(scope):
tf.logging.info('inputs: {}'.format(inputs))
for t in inputs:
tf.logging.info(t)
min_var = _quant_var('min', default_min)
max_var = _quant_var('max', default_max)
if not is_training:
# If we are building an eval graph just use the values in the variables.
quant_inputs = [
tf.fake_quant_with_min_max_vars(t, min_var, max_var) for t in inputs
]
tf.logging.info('min_val: {}'.format(min_var))
tf.logging.info('max_val: {}'.format(max_var))
else:
concat_tensors = tf.concat(inputs, axis=axis)
tf.logging.info('concat_tensors: {}'.format(concat_tensors))
# Otherwise we need to keep track of the moving averages of the min and
# of the elements of the input tensor max.
min_val = moving_averages.assign_moving_average(
min_var,
tf.reduce_min(concat_tensors),
ema_decay,
name='AssignMinEma')
max_val = moving_averages.assign_moving_average(
max_var,
tf.reduce_max(concat_tensors),
ema_decay,
name='AssignMaxEma')
tf.logging.info('min_val: {}'.format(min_val))
tf.logging.info('max_val: {}'.format(max_val))
quant_inputs = [
tf.fake_quant_with_min_max_vars(t, min_val, max_val) for t in inputs
]
tf.logging.info('quant_inputs: {}'.format(quant_inputs))
outputs = tf.concat(quant_inputs, axis=axis)
tf.logging.info('outputs: {}'.format(outputs))
else:
outputs = tf.concat(inputs, axis=axis)
return outputs
def quantizable_separable_conv2d(inputs,
num_outputs,
kernel_size,
is_quantized=True,
depth_multiplier=1,
stride=1,
activation_fn=tf.nn.relu6,
normalizer_fn=None,
scope=None):
"""Quantization friendly backward compatible separable conv2d.
This op has the same API is separable_conv2d. The main difference is that an
additional BiasAdd is manually inserted after the depthwise conv, such that
the depthwise bias will not have name conflict with pointwise bias. The
motivation of this op is that quantization script need BiasAdd in order to
recognize the op, in which a native call to separable_conv2d do not create
for the depthwise conv.
Args:
inputs: A tensor of size [batch_size, height, width, channels].
num_outputs: The number of pointwise convolution output filters. If is
None, then we skip the pointwise convolution stage.
kernel_size: A list of length 2: [kernel_height, kernel_width] of the
filters. Can be an int if both values are the same.
is_quantized: flag to enable/disable quantization.
depth_multiplier: The number of depthwise convolution output channels for
each input channel. The total number of depthwise convolution output
channels will be equal to num_filters_in * depth_multiplier.
stride: A list of length 2: [stride_height, stride_width], specifying the
depthwise convolution stride. Can be an int if both strides are the same.
activation_fn: Activation function. The default value is a ReLU function.
Explicitly set it to None to skip it and maintain a linear activation.
normalizer_fn: Normalization function to use instead of biases.
scope: Optional scope for variable_scope.
Returns:
Tensor resulting from the quantization-friendly separable convolution.
"""
if is_quantized:
outputs = tf.contrib.layers.separable_conv2d(
inputs,
None,
kernel_size,
depth_multiplier=depth_multiplier,
stride=1,
activation_fn=None,
normalizer_fn=None,
biases_initializer=None,
scope=scope)
outputs = tf.contrib.layers.bias_add(
outputs, trainable=True, scope='%s_bias' % scope)
outputs = tf.contrib.layers.conv2d(
outputs,
num_outputs, [1, 1],
activation_fn=activation_fn,
stride=stride,
normalizer_fn=normalizer_fn,
scope=scope)
else:
outputs = tf.contrib.layers.separable_conv2d(
inputs,
num_outputs,
kernel_size,
depth_multiplier=depth_multiplier,
stride=stride,
activation_fn=activation_fn,
normalizer_fn=normalizer_fn,
scope=scope)
return outputs
def quantize_op(inputs,
is_training=True,
is_quantized=True,
default_min=0,
default_max=6,
ema_decay=0.999,
scope='quant'):
"""Inserts a fake quantization op after inputs.
Args:
inputs: A tensor of size [batch_size, height, width, channels].
is_training: true if the graph is a training graph.
is_quantized: flag to enable/disable quantization.
default_min: default min value for fake quant op.
default_max: default max value for fake quant op.
ema_decay: the moving average decay for the quantization variables.
scope: Optional scope for variable_scope.
Returns:
Tensor resulting from quantizing the input tensors.
"""
if is_quantized:
with tf.variable_scope(scope):
min_var = _quant_var('min', default_min)
max_var = _quant_var('max', default_max)
if is_training:
min_val = moving_averages.assign_moving_average(
min_var, tf.reduce_min(inputs), ema_decay, name='AssignMinEma')
max_val = moving_averages.assign_moving_average(
max_var, tf.reduce_max(inputs), ema_decay, name='AssignMaxEma')
inputs = tf.fake_quant_with_min_max_vars(inputs, min_val, max_val)
else:
inputs = tf.fake_quant_with_min_max_vars(inputs, min_var, max_var)
return inputs
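
A brief usage sketch of the two helpers above, mirroring the shapes used in the tests that follow.

```python
import tensorflow as tf
from lstm_object_detection.lstm import utils

branch_a = tf.zeros([4, 10, 10, 1], dtype=tf.float32)
branch_b = tf.ones([4, 10, 10, 2], dtype=tf.float32)

# Concatenate two branches while sharing a single fake-quant min/max range;
# moving-average range updates are only created when is_training=True.
merged = utils.quantizable_concat([branch_a, branch_b], axis=3, is_training=True)

# Insert a standalone fake-quant op after an arbitrary activation tensor.
quantized = utils.quantize_op(merged, is_training=True)
```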
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lstm_object_detection.lstm.utils."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from lstm_object_detection.lstm import utils
class QuantizableUtilsTest(tf.test.TestCase):
def test_quantizable_concat_is_training(self):
inputs_1 = tf.zeros([4, 10, 10, 1], dtype=tf.float32)
inputs_2 = tf.ones([4, 10, 10, 2], dtype=tf.float32)
concat_in_train = utils.quantizable_concat([inputs_1, inputs_2],
axis=3,
is_training=True)
self.assertAllEqual([4, 10, 10, 3], concat_in_train.shape.as_list())
self._check_min_max_ema(tf.get_default_graph())
self._check_min_max_vars(tf.get_default_graph())
def test_quantizable_concat_inference(self):
inputs_1 = tf.zeros([4, 10, 10, 1], dtype=tf.float32)
inputs_2 = tf.ones([4, 10, 10, 2], dtype=tf.float32)
concat_in_train = utils.quantizable_concat([inputs_1, inputs_2],
axis=3,
is_training=False)
self.assertAllEqual([4, 10, 10, 3], concat_in_train.shape.as_list())
self._check_no_min_max_ema(tf.get_default_graph())
self._check_min_max_vars(tf.get_default_graph())
def test_quantizable_concat_not_quantized_is_training(self):
inputs_1 = tf.zeros([4, 10, 10, 1], dtype=tf.float32)
inputs_2 = tf.ones([4, 10, 10, 2], dtype=tf.float32)
concat_in_train = utils.quantizable_concat([inputs_1, inputs_2],
axis=3,
is_training=True,
is_quantized=False)
self.assertAllEqual([4, 10, 10, 3], concat_in_train.shape.as_list())
self._check_no_min_max_ema(tf.get_default_graph())
self._check_no_min_max_vars(tf.get_default_graph())
def test_quantizable_concat_not_quantized_inference(self):
inputs_1 = tf.zeros([4, 10, 10, 1], dtype=tf.float32)
inputs_2 = tf.ones([4, 10, 10, 2], dtype=tf.float32)
concat_in_train = utils.quantizable_concat([inputs_1, inputs_2],
axis=3,
is_training=False,
is_quantized=False)
self.assertAllEqual([4, 10, 10, 3], concat_in_train.shape.as_list())
self._check_no_min_max_ema(tf.get_default_graph())
self._check_no_min_max_vars(tf.get_default_graph())
def test_quantize_op_is_training(self):
inputs = tf.zeros([4, 10, 10, 128], dtype=tf.float32)
outputs = utils.quantize_op(inputs)
self.assertAllEqual(inputs.shape.as_list(), outputs.shape.as_list())
self._check_min_max_ema(tf.get_default_graph())
self._check_min_max_vars(tf.get_default_graph())
def test_quantize_op_inference(self):
inputs = tf.zeros([4, 10, 10, 128], dtype=tf.float32)
outputs = utils.quantize_op(inputs, is_training=False)
self.assertAllEqual(inputs.shape.as_list(), outputs.shape.as_list())
self._check_no_min_max_ema(tf.get_default_graph())
self._check_min_max_vars(tf.get_default_graph())
def _check_min_max_vars(self, graph):
op_types = [op.type for op in graph.get_operations()]
self.assertTrue(
any('FakeQuantWithMinMaxVars' in op_type for op_type in op_types))
def _check_min_max_ema(self, graph):
op_names = [op.name for op in graph.get_operations()]
self.assertTrue(any('AssignMinEma' in name for name in op_names))
self.assertTrue(any('AssignMaxEma' in name for name in op_names))
def _check_no_min_max_vars(self, graph):
op_types = [op.type for op in graph.get_operations()]
self.assertFalse(
any('FakeQuantWithMinMaxVars' in op_type for op_type in op_types))
def _check_no_min_max_ema(self, graph):
op_names = [op.name for op in graph.get_operations()]
self.assertFalse(any('AssignMinEma' in name for name in op_names))
self.assertFalse(any('AssignMaxEma' in name for name in op_names))
class QuantizableSeparableConv2dTest(tf.test.TestCase):
def test_quantizable_separable_conv2d(self):
inputs = tf.zeros([4, 10, 10, 128], dtype=tf.float32)
num_outputs = 64
kernel_size = [3, 3]
scope = 'QuantSeparable'
outputs = utils.quantizable_separable_conv2d(
inputs, num_outputs, kernel_size, scope=scope)
self.assertAllEqual([4, 10, 10, num_outputs], outputs.shape.as_list())
self._check_depthwise_bias_add(tf.get_default_graph(), scope)
def test_quantizable_separable_conv2d_not_quantized(self):
inputs = tf.zeros([4, 10, 10, 128], dtype=tf.float32)
num_outputs = 64
kernel_size = [3, 3]
scope = 'QuantSeparable'
outputs = utils.quantizable_separable_conv2d(
inputs, num_outputs, kernel_size, is_quantized=False, scope=scope)
self.assertAllEqual([4, 10, 10, num_outputs], outputs.shape.as_list())
self._check_no_depthwise_bias_add(tf.get_default_graph(), scope)
def _check_depthwise_bias_add(self, graph, scope):
op_names = [op.name for op in graph.get_operations()]
self.assertTrue(
any('%s_bias/BiasAdd' % scope in name for name in op_names))
def _check_no_depthwise_bias_add(self, graph, scope):
op_names = [op.name for op in graph.get_operations()]
self.assertFalse(
any('%s_bias/BiasAdd' % scope in name for name in op_names))
if __name__ == '__main__':
tf.test.main()