Commit 0d8e49ec authored by Yinxiao Li, committed by dreamdragon

PiperOrigin-RevId: 206648257

parent d7676c1c
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# For training on Imagenet Video with LSTM Mobilenet V1
[object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 4
}
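# Note: the train/eval unroll lengths above are typically kept consistent with
# the video_length in the tf_record_video_input_reader sections below (all 4
# in this config).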
model {
ssd {
num_classes: 30
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 5
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 256
width: 256
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 3
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 3
box_code_size: 4
apply_sigmoid_to_scores: false
use_depthwise: true
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'lstm_mobilenet_v1'
min_depth: 16
depth_multiplier: 1.0
use_depthwise: true
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 0
}
classification_weight: 1.0
localization_weight: 4.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: -20.0
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 8
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
optimizer {
use_moving_average: false
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.002
decay_steps: 200000
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
from_detection_checkpoint: true
gradient_clipping_by_norm: 10.0
batch_queue_capacity: 12
prefetch_queue_capacity: 4
fine_tune_checkpoint: "/path/to/checkpoint/"
fine_tune_checkpoint_type: "detection"
}
train_input_reader: {
shuffle_buffer_size: 32
queue_capacity: 12
prefetch_size: 12
min_after_dequeue: 4
label_map_path: "path/to/label_map"
external_input_reader {
[lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
tf_record_video_input_reader: {
input_path: "your/cns/path"
data_type: TF_SEQUENCE_EXAMPLE
video_length: 4
}
}
}
}
eval_config: {
metrics_set: "coco_evaluation_last_frame"
use_moving_averages: true
min_score_threshold: 0.5
max_num_boxes_to_visualize: 300
visualize_groundtruth_boxes: true
groundtruth_box_visualization_color: "red"
}
eval_input_reader: {
label_map_path: "path/to/label_map"
external_input_reader {
[lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
tf_record_video_input_reader: {
input_path: "your/cns/path"
data_type: TF_SEQUENCE_EXAMPLE
video_length: 4
}
}
}
shuffle: true
num_readers: 1
}
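# Note: fine_tune_checkpoint, label_map_path, and input_path above are
# placeholders and must be replaced with real paths before training or
# evaluation.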
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Evaluation executable for detection models.
This executable is used to evaluate DetectionModels. Example usage:
./eval \
--logtostderr \
--checkpoint_dir=path/to/checkpoint_dir \
--eval_dir=path/to/eval_dir \
--pipeline_config_path=pipeline_config.pbtxt
"""
import functools
import os
import tensorflow as tf
from google.protobuf import text_format
from google3.pyglib import app
from google3.pyglib import flags
from lstm_object_detection import evaluator
from lstm_object_detection import model_builder
from lstm_object_detection import seq_dataset_builder
from lstm_object_detection.utils import config_util
from google3.third_party.tensorflow_models.object_detection.utils import label_map_util
tf.logging.set_verbosity(tf.logging.INFO)
flags = tf.app.flags
flags.DEFINE_boolean('eval_training_data', False,
'If training data should be evaluated for this job.')
flags.DEFINE_string('checkpoint_dir', '',
'Directory containing checkpoints to evaluate, typically '
'set to `train_dir` used in the training job.')
flags.DEFINE_string('eval_dir', '', 'Directory to write eval summaries to.')
flags.DEFINE_string('pipeline_config_path', '',
'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
'file. If provided, other configs are ignored')
flags.DEFINE_boolean('run_once', False, 'Option to only run a single pass of '
'evaluation. Overrides the `max_evals` parameter in the '
'provided config.')
FLAGS = flags.FLAGS
def main(unused_argv):
assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
assert FLAGS.eval_dir, '`eval_dir` is missing.'
if FLAGS.pipeline_config_path:
configs = config_util.get_configs_from_pipeline_file(
FLAGS.pipeline_config_path)
else:
configs = config_util.get_configs_from_multiple_files(
model_config_path=FLAGS.model_config_path,
eval_config_path=FLAGS.eval_config_path,
eval_input_config_path=FLAGS.input_config_path)
pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
config_text = text_format.MessageToString(pipeline_proto)
tf.gfile.MakeDirs(FLAGS.eval_dir)
with tf.gfile.Open(os.path.join(FLAGS.eval_dir, 'pipeline.config'),
'wb') as f:
f.write(config_text)
model_config = configs['model']
lstm_config = configs['lstm_model']
eval_config = configs['eval_config']
input_config = configs['eval_input_config']
if FLAGS.eval_training_data:
input_config.external_input_reader.CopyFrom(
configs['train_input_config'].external_input_reader)
lstm_config.eval_unroll_length = lstm_config.train_unroll_length
model_fn = functools.partial(
model_builder.build,
model_config=model_config,
lstm_config=lstm_config,
is_training=False)
def get_next(config, model_config, lstm_config, unroll_length):
return seq_dataset_builder.build(config, model_config, lstm_config,
unroll_length)
create_input_dict_fn = functools.partial(get_next, input_config, model_config,
lstm_config,
lstm_config.eval_unroll_length)
label_map = label_map_util.load_labelmap(input_config.label_map_path)
max_num_classes = max([item.id for item in label_map.item])
categories = label_map_util.convert_label_map_to_categories(
label_map, max_num_classes)
if FLAGS.run_once:
eval_config.max_evals = 1
evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
FLAGS.checkpoint_dir, FLAGS.eval_dir)
if __name__ == '__main__':
app.run()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Detection model evaluator.
This file provides a generic evaluation method that can be used to evaluate a
DetectionModel.
"""
import logging
import tensorflow as tf
from lstm_object_detection.metrics import coco_evaluation_all_frames
from google3.third_party.tensorflow_models.object_detection import eval_util
from google3.third_party.tensorflow_models.object_detection.core import prefetcher
from google3.third_party.tensorflow_models.object_detection.core import standard_fields as fields
from google3.third_party.tensorflow_models.object_detection.metrics import coco_evaluation
from google3.third_party.tensorflow_models.object_detection.utils import object_detection_evaluation
# A dictionary of metric names to classes that implement the metric. The classes
# in the dictionary must implement the
# utils.object_detection_evaluation.DetectionEvaluator interface.
EVAL_METRICS_CLASS_DICT = {
'pascal_voc_detection_metrics':
object_detection_evaluation.PascalDetectionEvaluator,
'weighted_pascal_voc_detection_metrics':
object_detection_evaluation.WeightedPascalDetectionEvaluator,
'pascal_voc_instance_segmentation_metrics':
object_detection_evaluation.PascalInstanceSegmentationEvaluator,
'weighted_pascal_voc_instance_segmentation_metrics':
object_detection_evaluation.WeightedPascalInstanceSegmentationEvaluator,
'open_images_detection_metrics':
object_detection_evaluation.OpenImagesDetectionEvaluator,
'coco_detection_metrics':
coco_evaluation.CocoDetectionEvaluator,
'coco_mask_metrics':
coco_evaluation.CocoMaskEvaluator,
'coco_evaluation_all_frames':
coco_evaluation_all_frames.CocoEvaluationAllFrames,
}
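# 'coco_evaluation_all_frames' is the video-specific evaluator defined in this
# package (metrics/coco_evaluation_all_frames.py); the other entries come from
# the core object_detection metrics.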
EVAL_DEFAULT_METRIC = 'pascal_voc_detection_metrics'
def _create_detection_op(model, input_dict, batch):
"""Create detection ops.
Args:
model: model to perform predictions with.
input_dict: A dict holding input data.
batch: batch size for evaluation.
Returns:
Detection tensor ops.
"""
video_tensor = tf.stack(list(input_dict[fields.InputDataFields.image]))
preprocessed_video, true_image_shapes = model.preprocess(
tf.to_float(video_tensor))
if batch is not None:
prediction_dict = model.predict(preprocessed_video, true_image_shapes,
batch)
else:
prediction_dict = model.predict(preprocessed_video, true_image_shapes)
return model.postprocess(prediction_dict, true_image_shapes)
def _extract_prediction_tensors(model,
create_input_dict_fn,
ignore_groundtruth=False):
"""Restores the model in a tensorflow session.
Args:
model: model to perform predictions with.
create_input_dict_fn: function to create input tensor dictionaries.
ignore_groundtruth: whether groundtruth should be ignored.
Returns:
ret: A list of tensor dictionaries, one per frame, holding detections and
groundtruth for evaluation.
"""
input_dict = create_input_dict_fn()
batch = None
if 'batch' in input_dict:
batch = input_dict.pop('batch')
else:
prefetch_queue = prefetcher.prefetch(input_dict, capacity=500)
input_dict = prefetch_queue.dequeue()
# consistent format for images and videos
for key, value in input_dict.iteritems():
input_dict[key] = (value,)
detections = _create_detection_op(model, input_dict, batch)
# Print out analysis of the model.
tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
tfprof_options=tf.contrib.tfprof.model_analyzer.
TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
num_frames = len(input_dict[fields.InputDataFields.image])
ret = []
for i in range(num_frames):
original_image = tf.expand_dims(input_dict[fields.InputDataFields.image][i],
0)
groundtruth = None
if not ignore_groundtruth:
groundtruth = {
fields.InputDataFields.groundtruth_boxes:
input_dict[fields.InputDataFields.groundtruth_boxes][i],
fields.InputDataFields.groundtruth_classes:
input_dict[fields.InputDataFields.groundtruth_classes][i],
}
optional_keys = (
fields.InputDataFields.groundtruth_area,
fields.InputDataFields.groundtruth_is_crowd,
fields.InputDataFields.groundtruth_difficult,
fields.InputDataFields.groundtruth_group_of,
)
for opt_key in optional_keys:
if opt_key in input_dict:
groundtruth[opt_key] = input_dict[opt_key][i]
if fields.DetectionResultFields.detection_masks in detections:
groundtruth[fields.InputDataFields.groundtruth_instance_masks] = (
input_dict[fields.InputDataFields.groundtruth_instance_masks][i])
detections_frame = {
key: tf.expand_dims(value[i], 0)
for key, value in detections.iteritems()
}
source_id = (
batch.key[0] if batch is not None else
input_dict[fields.InputDataFields.source_id][i])
ret.append(
eval_util.result_dict_for_single_example(
original_image,
source_id,
detections_frame,
groundtruth,
class_agnostic=(fields.DetectionResultFields.detection_classes
not in detections),
scale_to_absolute=True))
return ret
def get_evaluators(eval_config, categories):
"""Returns the evaluator class according to eval_config, valid for categories.
Args:
eval_config: evaluation configurations.
categories: a list of categories to evaluate.
Returns:
A list of instances of DetectionEvaluator.
Raises:
ValueError: if metric is not in the metric class dictionary.
"""
eval_metric_fn_keys = eval_config.metrics_set
if not eval_metric_fn_keys:
eval_metric_fn_keys = [EVAL_DEFAULT_METRIC]
evaluators_list = []
for eval_metric_fn_key in eval_metric_fn_keys:
if eval_metric_fn_key not in EVAL_METRICS_CLASS_DICT:
raise ValueError('Metric not found: {}'.format(eval_metric_fn_key))
else:
evaluators_list.append(
EVAL_METRICS_CLASS_DICT[eval_metric_fn_key](categories=categories))
return evaluators_list
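# Illustrative example: if eval_config.metrics_set contains
# 'coco_evaluation_all_frames', get_evaluators returns
# [CocoEvaluationAllFrames(categories=categories)].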
def evaluate(create_input_dict_fn,
create_model_fn,
eval_config,
categories,
checkpoint_dir,
eval_dir,
graph_hook_fn=None):
"""Evaluation function for detection models.
Args:
create_input_dict_fn: a function to create a tensor input dictionary.
create_model_fn: a function that creates a DetectionModel.
eval_config: an eval_pb2.EvalConfig protobuf.
categories: a list of category dictionaries. Each dict in the list should
have an integer 'id' field and string 'name' field.
checkpoint_dir: directory to load the checkpoints to evaluate from.
eval_dir: directory to write evaluation metrics summary to.
graph_hook_fn: Optional function that is called after the training graph is
completely built. This is helpful to perform additional changes to the
training graph such as optimizing batchnorm. The function should modify
the default graph.
Returns:
metrics: A dictionary containing metric names and values from the latest
run.
"""
model = create_model_fn()
if eval_config.ignore_groundtruth and not eval_config.export_path:
logging.fatal('If ignore_groundtruth=True then an export_path is '
'required. Aborting!!!')
tensor_dicts = _extract_prediction_tensors(
model=model,
create_input_dict_fn=create_input_dict_fn,
ignore_groundtruth=eval_config.ignore_groundtruth)
def _process_batch(tensor_dicts,
sess,
batch_index,
counters,
losses_dict=None):
"""Evaluates tensors in tensor_dicts, visualizing the first K examples.
This function calls sess.run on tensor_dicts, evaluating the original_image
tensor only on the first K examples and visualizing detections overlaid
on this original_image.
Args:
tensor_dicts: a dictionary of tensors
sess: tensorflow session
batch_index: the index of the batch amongst all batches in the run.
counters: a dictionary holding 'success' and 'skipped' fields which can
be updated to keep track of number of successful and failed runs,
respectively. If these fields are not updated, then the success/skipped
counter values shown at the end of evaluation will be incorrect.
losses_dict: Optional dictionary of scalar loss tensors. Necessary only
for matching the function signature in third_party eval_util.py.
Returns:
result_dict: a dictionary of numpy arrays
result_losses_dict: a dictionary of scalar losses. This is empty if input
losses_dict is None. Necessary only for matching the function signature in
third_party eval_util.py.
"""
if batch_index % 10 == 0:
logging.info('Running eval ops batch %d', batch_index)
if not losses_dict:
losses_dict = {}
try:
result_dicts, result_losses_dict = sess.run([tensor_dicts, losses_dict])
counters['success'] += 1
except tf.errors.InvalidArgumentError:
logging.info('Skipping image')
counters['skipped'] += 1
return {}
num_images = len(tensor_dicts)
for i in range(num_images):
result_dict = result_dicts[i]
global_step = tf.train.global_step(sess, tf.train.get_global_step())
tag = 'image-%d' % (batch_index * num_images + i)
if batch_index < eval_config.num_visualizations / num_images:
eval_util.visualize_detection_results(
result_dict,
tag,
global_step,
categories=categories,
summary_dir=eval_dir,
export_dir=eval_config.visualization_export_dir,
show_groundtruth=eval_config.visualize_groundtruth_boxes,
groundtruth_box_visualization_color=eval_config.
groundtruth_box_visualization_color,
min_score_thresh=eval_config.min_score_threshold,
max_num_predictions=eval_config.max_num_boxes_to_visualize,
skip_scores=eval_config.skip_scores,
skip_labels=eval_config.skip_labels,
keep_image_id_for_visualization_export=eval_config.
keep_image_id_for_visualization_export)
if num_images > 1:
return result_dicts, result_losses_dict
else:
return result_dicts[0], result_losses_dict
variables_to_restore = tf.global_variables()
global_step = tf.train.get_or_create_global_step()
variables_to_restore.append(global_step)
if graph_hook_fn:
graph_hook_fn()
if eval_config.use_moving_averages:
variable_averages = tf.train.ExponentialMovingAverage(0.0)
variables_to_restore = variable_averages.variables_to_restore()
for key in variables_to_restore.keys():
if 'moving_mean' in key:
variables_to_restore[key.replace(
'moving_mean', 'moving_mean/ExponentialMovingAverage')] = (
variables_to_restore[key])
del variables_to_restore[key]
if 'moving_variance' in key:
variables_to_restore[key.replace(
'moving_variance', 'moving_variance/ExponentialMovingAverage')] = (
variables_to_restore[key])
del variables_to_restore[key]
saver = tf.train.Saver(variables_to_restore)
def _restore_latest_checkpoint(sess):
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
saver.restore(sess, latest_checkpoint)
metrics = eval_util.repeated_checkpoint_run(
tensor_dict=tensor_dicts,
summary_dir=eval_dir,
evaluators=get_evaluators(eval_config, categories),
batch_processor=_process_batch,
checkpoint_dirs=[checkpoint_dir],
variables_to_restore=None,
restore_fn=_restore_latest_checkpoint,
num_batches=eval_config.num_examples,
eval_interval_secs=eval_config.eval_interval_secs,
max_number_of_evaluations=(1 if eval_config.ignore_groundtruth else
eval_config.max_evals
if eval_config.max_evals else None),
master=eval_config.eval_master,
save_graph=eval_config.save_graph,
save_graph_dir=(eval_dir if eval_config.save_graph else ''))
return metrics
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BottleneckConvLSTMCell implementation."""
import google3
import tensorflow.google as tf
import google3.learning.brain.contrib.slim as slim
from tensorflow.contrib.framework.python.ops import variables
_batch_norm = tf.contrib.layers.batch_norm
class BottleneckConvLSTMCell(tf.contrib.rnn.RNNCell):
"""Basic LSTM recurrent network cell using separable convolutions.
The implementation is based on: http://arxiv.org/abs/1409.2329.
We add forget_bias (default: 1) to the biases of the forget gate in order to
reduce the scale of forgetting at the beginning of training.
This LSTM first projects inputs to the size of the output before doing gate
computations. This saves params unless the input is less than a third of the
state size channel-wise.
"""
def __init__(self,
filter_size,
output_size,
num_units,
forget_bias=1.0,
activation=tf.tanh,
flattened_state=False,
visualize_gates=True):
"""Initializes the basic LSTM cell.
Args:
filter_size: collection, conv filter size
output_size: collection, the width/height dimensions of the cell/output
num_units: int, The number of channels in the LSTM cell.
forget_bias: float, The bias added to forget gates (see above).
activation: Activation function of the inner states.
flattened_state: if True, the state tensor will be flattened and stored
as a 2-D tensor. Used for exporting the model to tfmini.
visualize_gates: if True, add histogram summaries of all gates
and outputs to tensorboard
"""
self._filter_size = list(filter_size)
self._output_size = list(output_size)
self._num_units = num_units
self._forget_bias = forget_bias
self._activation = activation
self._viz_gates = visualize_gates
self._flattened_state = flattened_state
self._param_count = self._num_units
for dim in self._output_size:
self._param_count *= dim
@property
def state_size(self):
return tf.contrib.rnn.LSTMStateTuple(self._output_size + [self._num_units],
self._output_size + [self._num_units])
@property
def state_size_flat(self):
return tf.contrib.rnn.LSTMStateTuple([self._param_count],
[self._param_count])
@property
def output_size(self):
return self._output_size + [self._num_units]
def __call__(self, inputs, state, scope=None):
"""Long short-term memory cell (LSTM) with bottlenecking.
Args:
inputs: Input tensor at the current timestep.
state: Tuple of tensors, the state and output at the previous timestep.
scope: Optional scope.
Returns:
A tuple where the first element is the LSTM output and the second is
a LSTMStateTuple of the state at the current timestep.
"""
scope = scope or 'conv_lstm_cell'
with tf.variable_scope(scope):
c, h = state
# unflatten state if necessary
if self._flattened_state:
c = tf.reshape(c, [-1] + self.output_size)
h = tf.reshape(h, [-1] + self.output_size)
# summary of input passed into cell
if self._viz_gates:
slim.summaries.add_histogram_summary(inputs, 'cell_input')
bottleneck = tf.contrib.layers.separable_conv2d(
tf.concat([inputs, h], 3),
self._num_units,
self._filter_size,
depth_multiplier=1,
activation_fn=self._activation,
normalizer_fn=None,
scope='bottleneck')
if self._viz_gates:
slim.summaries.add_histogram_summary(bottleneck, 'bottleneck')
concat = tf.contrib.layers.separable_conv2d(
bottleneck,
4 * self._num_units,
self._filter_size,
depth_multiplier=1,
activation_fn=None,
normalizer_fn=None,
scope='gates')
i, j, f, o = tf.split(concat, 4, 3)
new_c = (
c * tf.sigmoid(f + self._forget_bias) +
tf.sigmoid(i) * self._activation(j))
new_h = self._activation(new_c) * tf.sigmoid(o)
# summary of cell output and new state
if self._viz_gates:
slim.summaries.add_histogram_summary(new_h, 'cell_output')
slim.summaries.add_histogram_summary(new_c, 'cell_state')
# reflatten state to store it
if self._flattened_state:
new_c = tf.reshape(new_c, [-1, self._param_count])
new_h = tf.reshape(new_h, [-1, self._param_count])
return new_h, tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
def init_state(self, state_name, batch_size, dtype, learned_state=False):
"""Creates an initial state compatible with this cell.
Args:
state_name: name of the state tensor
batch_size: model batch size
dtype: dtype for the tensor values i.e. tf.float32
learned_state: whether the initial state should be learnable. If false,
the initial state is set to all 0's
Returns:
The created initial state.
"""
state_size = (
self.state_size_flat if self._flattened_state else self.state_size)
# List of two zero tensors or learnable variable tensors, depending on
# whether learned_state is true.
ret_flat = [(variables.model_variable(
state_name + str(i),
shape=s,
dtype=dtype,
initializer=tf.truncated_normal_initializer(stddev=0.03))
if learned_state else tf.zeros(
[batch_size] + s, dtype=dtype, name=state_name))
for i, s in enumerate(state_size)]
# duplicates initial state across the batch axis if it's learned
if learned_state:
ret_flat = [
tf.stack([tensor
for i in range(int(batch_size))])
for tensor in ret_flat
]
for s, r in zip(state_size, ret_flat):
r.set_shape([None] + s)
return tf.nest.pack_sequence_as(structure=[1, 1], flat_sequence=ret_flat)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTM Meta-architecture definition.
General tensorflow implementation of convolutional Multibox/SSD detection
models with LSTM states, for use on video data.
See https://arxiv.org/abs/1711.06368 for details.
"""
import re
import tensorflow as tf
from google3.third_party.tensorflow_models.object_detection.core import box_list_ops
from google3.third_party.tensorflow_models.object_detection.core import standard_fields as fields
from google3.third_party.tensorflow_models.object_detection.meta_architectures import ssd_meta_arch
from google3.third_party.tensorflow_models.object_detection.utils import ops
from google3.third_party.tensorflow_models.object_detection.utils import shape_utils
slim = tf.contrib.slim
class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
"""LSTM Meta-architecture definition."""
def __init__(self,
is_training,
anchor_generator,
box_predictor,
box_coder,
feature_extractor,
matcher,
region_similarity_calculator,
encode_background_as_zeros,
negative_class_weight,
image_resizer_fn,
non_max_suppression_fn,
score_conversion_fn,
classification_loss,
localization_loss,
classification_loss_weight,
localization_loss_weight,
normalize_loss_by_num_matches,
hard_example_miner,
unroll_length,
add_summaries=True):
super(LSTMMetaArch, self).__init__(
is_training, anchor_generator, box_predictor, box_coder,
feature_extractor, matcher, region_similarity_calculator,
encode_background_as_zeros, negative_class_weight, image_resizer_fn,
non_max_suppression_fn, score_conversion_fn, classification_loss,
localization_loss, classification_loss_weight, localization_loss_weight,
normalize_loss_by_num_matches, hard_example_miner, add_summaries)
self._unroll_length = unroll_length
@property
def unroll_length(self):
return self._unroll_length
@unroll_length.setter
def unroll_length(self, unroll_length):
self._unroll_length = unroll_length
def predict(self, preprocessed_inputs, true_image_shapes, states=None,
state_name='lstm_state', feature_scope=None):
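# This override threads LSTM states through the feature extractor and adds
# 'states_and_outputs' (and, when states are provided, 'step') to the
# standard SSD prediction dict.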
with tf.variable_scope(self._extract_features_scope,
values=[preprocessed_inputs], reuse=tf.AUTO_REUSE):
feature_maps = self._feature_extractor.extract_features(
preprocessed_inputs, states, state_name,
unroll_length=self._unroll_length, scope=feature_scope)
feature_map_spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
image_shape = shape_utils.combined_static_and_dynamic_shape(
preprocessed_inputs)
self._batch_size = preprocessed_inputs.shape[0].value / self._unroll_length
self._states = states
self._anchors = box_list_ops.concatenate(
self._anchor_generator.generate(
feature_map_spatial_dims,
im_height=image_shape[1],
im_width=image_shape[2]))
prediction_dict = self._box_predictor.predict(
feature_maps, self._anchor_generator.num_anchors_per_location())
# Multiscale_anchor_generator currently has a different dim compared to
# ssd_anchor_generator. Current fix is to check the dim of the box_encodings
# tensor. If dim is not 3 (multiscale_anchor_generator), squeeze the 3rd dim.
# TODO(yinxiao): Remove this check once the anchor generator has unified
# dimension.
if len(prediction_dict['box_encodings'][0].get_shape().as_list()) == 3:
box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1)
else:
box_encodings = tf.squeeze(
tf.concat(prediction_dict['box_encodings'], axis=1), axis=2)
class_predictions_with_background = tf.concat(
prediction_dict['class_predictions_with_background'], axis=1)
predictions_dict = {
'preprocessed_inputs': preprocessed_inputs,
'box_encodings': box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'feature_maps': feature_maps,
'anchors': self._anchors.get(),
'states_and_outputs': self._feature_extractor.states_and_outputs,
}
# In cases such as exporting the model, the states are always zero. Thus, the
# step should be ignored.
if states is not None:
predictions_dict['step'] = self._feature_extractor.step
return predictions_dict
def loss(self, prediction_dict, true_image_shapes, scope=None):
"""Computes scalar loss tensors with respect to provided groundtruth.
Calling this function requires that groundtruth tensors have been
provided via the provide_groundtruth function.
Args:
prediction_dict: a dictionary holding prediction tensors with
1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
box_code_dimension] containing predicted boxes.
2) class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
scope: Optional scope name.
Returns:
a dictionary mapping loss keys (`localization_loss` and
`classification_loss`) to scalar tensors representing corresponding loss
values.
"""
with tf.name_scope(scope, 'Loss', prediction_dict.values()):
keypoints = None
if self.groundtruth_has_field(fields.BoxListFields.keypoints):
keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints)
weights = None
if self.groundtruth_has_field(fields.BoxListFields.weights):
weights = self.groundtruth_lists(fields.BoxListFields.weights)
(batch_cls_targets, batch_cls_weights, batch_reg_targets,
batch_reg_weights, match_list) = self._assign_targets(
self.groundtruth_lists(fields.BoxListFields.boxes),
self.groundtruth_lists(fields.BoxListFields.classes),
keypoints, weights)
if self._add_summaries:
self._summarize_target_assignment(
self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
location_losses = self._localization_loss(
prediction_dict['box_encodings'],
batch_reg_targets,
ignore_nan_targets=True,
weights=batch_reg_weights)
cls_losses = ops.reduce_sum_trailing_dimensions(
self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
weights=batch_cls_weights),
ndims=2)
if self._hard_example_miner:
(loc_loss_list, cls_loss_list) = self._apply_hard_mining(
location_losses, cls_losses, prediction_dict, match_list)
localization_loss = tf.reduce_sum(tf.stack(loc_loss_list))
classification_loss = tf.reduce_sum(tf.stack(cls_loss_list))
if self._add_summaries:
self._hard_example_miner.summarize()
else:
if self._add_summaries:
class_ids = tf.argmax(batch_cls_targets, axis=2)
flattened_class_ids = tf.reshape(class_ids, [-1])
flattened_classification_losses = tf.reshape(cls_losses, [-1])
self._summarize_anchor_classification_loss(
flattened_class_ids, flattened_classification_losses)
localization_loss = tf.reduce_sum(location_losses)
classification_loss = tf.reduce_sum(cls_losses)
# Optionally normalize by number of positive matches
normalizer = tf.constant(1.0, dtype=tf.float32)
if self._normalize_loss_by_num_matches:
normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)),
1.0)
with tf.name_scope('localization_loss'):
localization_loss_normalizer = normalizer
if self._normalize_loc_loss_by_codesize:
localization_loss_normalizer *= self._box_coder.code_size
localization_loss = ((self._localization_loss_weight / (
localization_loss_normalizer)) * localization_loss)
with tf.name_scope('classification_loss'):
classification_loss = ((self._classification_loss_weight / normalizer) *
classification_loss)
loss_dict = {
'localization_loss': localization_loss,
'classification_loss': classification_loss
}
return loss_dict
def restore_map(self, fine_tune_checkpoint_type='lstm'):
"""Returns a map of variables to load from a foreign checkpoint.
See parent class for details.
Args:
fine_tune_checkpoint_type: the type of checkpoint to restore from, either
an SSD/LSTM detection checkpoint (with compatible variable names) or a
classification checkpoint for initialization prior to training.
Available options: `classification`, `detection`, and `lstm`.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
Raises:
ValueError: if fine_tune_checkpoint_type is not among
`classification`/`detection`/`lstm`.
"""
if fine_tune_checkpoint_type not in [
'classification', 'detection', 'lstm'
]:
raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
fine_tune_checkpoint_type))
variables_to_restore = {}
for variable in tf.global_variables():
var_name = variable.op.name
if 'global_step' in var_name:
continue
# Remove FeatureExtractor prefix for classification checkpoints.
if fine_tune_checkpoint_type == 'classification':
var_name = (
re.split('^' + self._extract_features_scope + '/', var_name)[-1])
# When loading from single frame detection checkpoints, we need to
# remap FeatureMaps variable names.
if ('FeatureMaps' in var_name and
fine_tune_checkpoint_type == 'detection'):
var_name = var_name.replace('FeatureMaps',
self.get_base_network_scope())
variables_to_restore[var_name] = variable
return variables_to_restore
def get_base_network_scope(self):
"""Returns the variable scope of the base network.
Returns:
The variable scope of the feature extractor base network, e.g. MobilenetV1
"""
return self._feature_extractor.get_base_network_scope()
class LSTMFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""LSTM Meta-architecture Feature Extractor definition."""
@property
def depth_multipliers(self):
return self._depth_multipliers
@depth_multipliers.setter
def depth_multipliers(self, depth_multipliers):
self._depth_multipliers = depth_multipliers
@property
def lstm_state_depth(self):
return self._lstm_state_depth
@lstm_state_depth.setter
def lstm_state_depth(self, lstm_state_depth):
self._lstm_state_depth = lstm_state_depth
@property
def states_and_outputs(self):
"""LSTM states and outputs.
This variable includes both LSTM states {C_t} and outputs {h_t}.
Returns:
states_and_outputs: A list of 4-D float tensors, including the lstm state
and output at each timestep.
"""
return self._states_out
@property
def step(self):
return self._step
def preprocess(self, resized_inputs):
"""SSD preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
"""
return (2.0 / 255.0) * resized_inputs - 1.0
def get_base_network_scope(self):
"""Returns the variable scope of the base network.
Returns:
The variable scope of the base network, e.g. MobilenetV1
"""
return self._base_network_scope
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Custom RNN decoder."""
from tensorflow.python.ops import variable_scope
def rnn_decoder(decoder_inputs,
initial_state,
cell,
loop_function=None,
scope=None):
"""RNN decoder for the sequence-to-sequence model.
This decoder returns a list of all states, rather than only the final state.
Args:
decoder_inputs: A list of 4D Tensors with shape [batch_size x input_size].
initial_state: 2D Tensor with shape [batch_size x cell.state_size].
cell: rnn_cell.RNNCell defining the cell function and size.
loop_function: If not None, this function will be applied to the i-th output
in order to generate the i+1-st input, and decoder_inputs will be ignored,
except for the first element ("GO" symbol). This can be used for decoding,
but also for training to emulate http://arxiv.org/abs/1506.03099.
Signature -- loop_function(prev, i) = next
* prev is a 2D Tensor of shape [batch_size x output_size],
* i is an integer, the step number (when advanced control is needed),
* next is a 2D Tensor of shape [batch_size x input_size].
scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
Returns:
A tuple of the form (outputs, state), where:
outputs: A list of the same length as decoder_inputs of 4D Tensors with
shape [batch_size x output_size] containing generated outputs.
state: A list of the same length as decoder_inputs of the state of each
cell at each time-step. It is a 2D Tensor of shape
[batch_size x cell.state_size].
"""
with variable_scope.variable_scope(scope or 'rnn_decoder'):
state = initial_state
outputs = []
states = []
prev = None
for i, decoder_input in enumerate(decoder_inputs):
if loop_function is not None and prev is not None:
with variable_scope.variable_scope('loop_function', reuse=True):
decoder_input = loop_function(prev, i)
if i > 0:
variable_scope.get_variable_scope().reuse_variables()
output, state = cell(decoder_input, state)
outputs.append(output)
states.append(state)
if loop_function is not None:
prev = output
return outputs, states
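# Illustrative usage sketch (hypothetical names): unroll a convolutional LSTM
# cell over a short frame sequence.
#   inputs = [frame_0, frame_1, frame_2, frame_3]   # each [batch, H, W, C]
#   init_state = cell.init_state('lstm_state', batch_size, tf.float32)
#   outputs, states = rnn_decoder(inputs, init_state, cell)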
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class for evaluating video object detections with COCO metrics."""
import tensorflow as tf
from google3.third_party.tensorflow_models.object_detection.core import standard_fields
from google3.third_party.tensorflow_models.object_detection.metrics import coco_evaluation
from google3.third_party.tensorflow_models.object_detection.metrics import coco_tools
class CocoEvaluationAllFrames(coco_evaluation.CocoDetectionEvaluator):
"""Class to evaluate COCO detection metrics for frame sequences.
The class overrides two functions: add_single_ground_truth_image_info and
add_single_detected_image_info.
For video object detection evaluation, this class iterates through the
entire groundtruth_dict so that every frame unrolled in one LSTM training
sample is considered. Both groundtruth and detection results for all frames
are therefore added to the evaluation. This is used when all frames are
labeled in the video object detection training job.
"""
def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
"""Add groundtruth results of all frames to the eval pipeline.
This method overrides the function defined in the base class.
Args:
image_id: A unique string/integer identifier for the image.
groundtruth_dict: A list of dictionaries, each containing -
InputDataFields.groundtruth_boxes: float32 numpy array of shape
[num_boxes, 4] containing `num_boxes` groundtruth boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
InputDataFields.groundtruth_classes: integer numpy array of shape
[num_boxes] containing 1-indexed groundtruth classes for the boxes.
InputDataFields.groundtruth_is_crowd (optional): integer numpy array of
shape [num_boxes] containing iscrowd flag for groundtruth boxes.
"""
for idx, gt in enumerate(groundtruth_dict):
if not gt:
continue
image_frame_id = '{}_{}'.format(image_id, idx)
if image_frame_id in self._image_ids:
tf.logging.warning(
'Ignoring ground truth with image id %s since it was '
'previously added', image_frame_id)
continue
self._groundtruth_list.extend(
coco_tools.ExportSingleImageGroundtruthToCoco(
image_id=image_frame_id,
next_annotation_id=self._annotation_id,
category_id_set=self._category_id_set,
groundtruth_boxes=gt[
standard_fields.InputDataFields.groundtruth_boxes],
groundtruth_classes=gt[
standard_fields.InputDataFields.groundtruth_classes]))
self._annotation_id += (
gt[standard_fields.InputDataFields.groundtruth_boxes].shape[0])
# Boolean to indicate whether a detection has been added for this image.
self._image_ids[image_frame_id] = False
def add_single_detected_image_info(self, image_id, detections_dict):
"""Add detection results of all frames to the eval pipeline.
This method overrides the function defined in the base class.
Args:
image_id: A unique string/integer identifier for the image.
detections_dict: A list of dictionaries, each containing -
DetectionResultFields.detection_boxes: float32 numpy array of shape
[num_boxes, 4] containing `num_boxes` detection boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
DetectionResultFields.detection_scores: float32 numpy array of shape
[num_boxes] containing detection scores for the boxes.
DetectionResultFields.detection_classes: integer numpy array of shape
[num_boxes] containing 1-indexed detection classes for the boxes.
Raises:
ValueError: If groundtruth for the image_id is not available.
"""
for idx, det in enumerate(detections_dict):
if not det:
continue
image_frame_id = '{}_{}'.format(image_id, idx)
if image_frame_id not in self._image_ids:
raise ValueError(
'Missing groundtruth for image-frame id: {}'.format(image_frame_id))
if self._image_ids[image_frame_id]:
tf.logging.warning(
'Ignoring detection with image id %s since it was '
'previously added', image_frame_id)
continue
self._detection_boxes_list.extend(
coco_tools.ExportSingleImageDetectionBoxesToCoco(
image_id=image_frame_id,
category_id_set=self._category_id_set,
detection_boxes=det[
standard_fields.DetectionResultFields.detection_boxes],
detection_scores=det[
standard_fields.DetectionResultFields.detection_scores],
detection_classes=det[
standard_fields.DetectionResultFields.detection_classes]))
self._image_ids[image_frame_id] = True
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_object_detection.metrics.coco_video_evaluation."""
import numpy as np
import tensorflow as tf
from lstm_object_detection.metrics import coco_evaluation_all_frames
from google3.third_party.tensorflow_models.object_detection.core import standard_fields
class CocoEvaluationAllFramesTest(tf.test.TestCase):
def testGroundtruthAndDetectionsDisagreeOnAllFrames(self):
"""Tests that mAP is calculated on several different frame results."""
category_list = [{'id': 0, 'name': 'dog'}, {'id': 1, 'name': 'cat'}]
video_evaluator = coco_evaluation_all_frames.CocoEvaluationAllFrames(
category_list)
video_evaluator.add_single_ground_truth_image_info(
image_id='image1',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 50., 200., 200.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}, {
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image1',
# The detection disagrees with the groundtruth on every frame except the last.
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}, {
standard_fields.DetectionResultFields.detection_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
metrics = video_evaluator.evaluate()
self.assertNotEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
def testGroundtruthAndDetections(self):
"""Tests that mAP is calculated correctly on GT and Detections."""
category_list = [{'id': 0, 'name': 'dog'}, {'id': 1, 'name': 'cat'}]
video_evaluator = coco_evaluation_all_frames.CocoEvaluationAllFrames(
category_list)
video_evaluator.add_single_ground_truth_image_info(
image_id='image1',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_ground_truth_image_info(
image_id='image2',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_ground_truth_image_info(
image_id='image3',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 100., 100., 120.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image1',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image2',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image3',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[50., 100., 100., 120.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
metrics = video_evaluator.evaluate()
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
def testMissingDetectionResults(self):
"""Tests if groundtrue is missing, raises ValueError."""
category_list = [{'id': 0, 'name': 'dog'}]
video_evaluator = coco_evaluation_all_frames.CocoEvaluationAllFrames(
category_list)
video_evaluator.add_single_ground_truth_image_info(
image_id='image1',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
with self.assertRaisesRegexp(ValueError,
r'Missing groundtruth for image-frame id:.*'):
video_evaluator.add_single_detected_image_info(
image_id='image3',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
if __name__ == '__main__':
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A function to build a DetectionModel from configuration."""
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.models.lstm_ssd_mobilenet_v1_feature_extractor import LSTMMobileNetV1FeatureExtractor
from google3.third_party.tensorflow_models.object_detection.builders import anchor_generator_builder
from google3.third_party.tensorflow_models.object_detection.builders import box_coder_builder
from google3.third_party.tensorflow_models.object_detection.builders import box_predictor_builder
from google3.third_party.tensorflow_models.object_detection.builders import hyperparams_builder
from google3.third_party.tensorflow_models.object_detection.builders import image_resizer_builder
from google3.third_party.tensorflow_models.object_detection.builders import losses_builder
from google3.third_party.tensorflow_models.object_detection.builders import matcher_builder
from google3.third_party.tensorflow_models.object_detection.builders import model_builder
from google3.third_party.tensorflow_models.object_detection.builders import post_processing_builder
from google3.third_party.tensorflow_models.object_detection.builders import region_similarity_calculator_builder as sim_calc
model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP.update({
'lstm_mobilenet_v1': LSTMMobileNetV1FeatureExtractor,
})
SSD_FEATURE_EXTRACTOR_CLASS_MAP = model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP
def build(model_config, lstm_config, is_training):
"""Builds a DetectionModel based on the model config.
Args:
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
is_training: True if this model is being built for training purposes.
Returns:
DetectionModel based on the config.
Raises:
ValueError: On invalid meta architecture or model.
"""
return _build_lstm_model(model_config.ssd, lstm_config, is_training)
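# Illustrative example: eval.py in this change builds the model roughly as:
#   configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
#   model = build(configs['model'], configs['lstm_model'], is_training=False)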
def _build_lstm_feature_extractor(feature_extractor_config,
is_training,
lstm_state_depth,
reuse_weights=None):
"""Builds a ssd_meta_arch.SSDFeatureExtractor based on config.
Args:
feature_extractor_config: A SSDFeatureExtractor proto config from ssd.proto.
is_training: True if this feature extractor is being built for training.
lstm_state_depth: An integer of the depth of the lstm state.
reuse_weights: If the feature extractor should reuse weights.
Returns:
ssd_meta_arch.SSDFeatureExtractor based on config.
Raises:
ValueError: On invalid feature extractor type.
"""
feature_type = feature_extractor_config.type
depth_multiplier = feature_extractor_config.depth_multiplier
min_depth = feature_extractor_config.min_depth
pad_to_multiple = feature_extractor_config.pad_to_multiple
use_explicit_padding = feature_extractor_config.use_explicit_padding
use_depthwise = feature_extractor_config.use_depthwise
conv_hyperparams = hyperparams_builder.build(
feature_extractor_config.conv_hyperparams, is_training)
override_base_feature_extractor_hyperparams = (
feature_extractor_config.override_base_feature_extractor_hyperparams)
if feature_type not in SSD_FEATURE_EXTRACTOR_CLASS_MAP:
raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type))
feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
return feature_extractor_class(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams, lstm_state_depth)
def _build_lstm_model(ssd_config, lstm_config, is_training):
"""Builds an LSTM detection model based on the model config.
Args:
ssd_config: A ssd.proto object containing the config for the desired
LSTMMetaArch.
lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
is_training: True if this model is being built for training purposes.
Returns:
LSTMMetaArch based on the config.
Raises:
ValueError: If ssd_config.type is not recognized (i.e. not registered in
model_class_map), or if lstm_config.interleave_strategy is not recognized.
ValueError: If unroll_length is not specified in the config file.
"""
feature_extractor = _build_lstm_feature_extractor(
ssd_config.feature_extractor, is_training, lstm_config.lstm_state_depth)
box_coder = box_coder_builder.build(ssd_config.box_coder)
matcher = matcher_builder.build(ssd_config.matcher)
region_similarity_calculator = sim_calc.build(
ssd_config.similarity_calculator)
num_classes = ssd_config.num_classes
ssd_box_predictor = box_predictor_builder.build(hyperparams_builder.build,
ssd_config.box_predictor,
is_training, num_classes)
anchor_generator = anchor_generator_builder.build(ssd_config.anchor_generator)
image_resizer_fn = image_resizer_builder.build(ssd_config.image_resizer)
non_max_suppression_fn, score_conversion_fn = post_processing_builder.build(
ssd_config.post_processing)
(classification_loss, localization_loss, classification_weight,
localization_weight, miner, _) = losses_builder.build(ssd_config.loss)
normalize_loss_by_num_matches = ssd_config.normalize_loss_by_num_matches
encode_background_as_zeros = ssd_config.encode_background_as_zeros
negative_class_weight = ssd_config.negative_class_weight
# Extra configs for lstm unroll length.
unroll_length = None
if 'lstm' in ssd_config.feature_extractor.type:
if is_training:
unroll_length = lstm_config.train_unroll_length
else:
unroll_length = lstm_config.eval_unroll_length
if unroll_length is None:
raise ValueError('No unroll length found in the config file')
lstm_model = lstm_meta_arch.LSTMMetaArch(
is_training, anchor_generator, ssd_box_predictor, box_coder,
feature_extractor, matcher, region_similarity_calculator,
encode_background_as_zeros, negative_class_weight, image_resizer_fn,
non_max_suppression_fn, score_conversion_fn, classification_loss,
localization_loss, classification_weight, localization_weight,
normalize_loss_by_num_matches, miner, unroll_length)
return lstm_model
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_object_detection.tensorflow.model_builder."""
import tensorflow as tf
from google.protobuf import text_format
from lstm_object_detection import model_builder
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.protos import pipeline_pb2 as internal_pipeline_pb2
from google3.third_party.tensorflow_models.object_detection.protos import pipeline_pb2
class ModelBuilderTest(tf.test.TestCase):
def create_model(self, model_config, lstm_config):
"""Builds a DetectionModel based on the model config.
Args:
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LstmModel config proto that specifies LSTM train/eval
configs.
Returns:
DetectionModel based on the config.
"""
return model_builder.build(model_config, lstm_config, is_training=True)
def get_model_configs_from_proto(self):
"""Creates a model text proto for testing.
Returns:
A dictionary of model configs.
"""
model_text_proto = """
[object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 4
}
model {
ssd {
feature_extractor {
type: 'lstm_mobilenet_v1'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
negative_class_weight: 2.0
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
aspect_ratios: 1.0
}
}
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
}
normalize_loc_loss_by_codesize: true
loss {
classification_loss {
weighted_softmax {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
}
}
}"""
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
text_format.Merge(model_text_proto, pipeline_config)
configs = {}
configs['model'] = pipeline_config.model
configs['lstm_model'] = pipeline_config.Extensions[
internal_pipeline_pb2.lstm_model]
return configs
def test_model_creation_from_valid_configs(self):
configs = self.get_model_configs_from_proto()
# Test model properties.
self.assertEqual(configs['model'].ssd.negative_class_weight, 2.0)
self.assertTrue(configs['model'].ssd.normalize_loc_loss_by_codesize)
self.assertEqual(configs['model'].ssd.feature_extractor.type,
'lstm_mobilenet_v1')
model = self.create_model(configs['model'], configs['lstm_model'])
    # Test architecture type.
self.assertIsInstance(model, lstm_meta_arch.LSTMMetaArch)
# Test LSTM unroll length.
self.assertEqual(model.unroll_length, 4)
def test_model_creation_from_invalid_configs(self):
configs = self.get_model_configs_from_proto()
# Test model build failure with wrong input configs.
with self.assertRaises(AttributeError):
_ = self.create_model(configs['model'], configs['model'])
# Test model builder failure with missing configs.
with self.assertRaises(TypeError):
# pylint: disable=no-value-for-parameter
_ = self.create_model(configs['lstm_model'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTMFeatureExtractor for MobilenetV1 features."""
import tensorflow as tf
from tensorflow.python.framework import ops as tf_ops
from lstm_object_detection.lstm import lstm_cells
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.lstm import rnn_decoder
from google3.third_party.tensorflow_models.object_detection.models import feature_map_generators
from google3.third_party.tensorflow_models.object_detection.utils import context_manager
from google3.third_party.tensorflow_models.object_detection.utils import ops
from google3.third_party.tensorflow_models.object_detection.utils import shape_utils
from nets import mobilenet_v1
slim = tf.contrib.slim
class LSTMMobileNetV1FeatureExtractor(lstm_meta_arch.LSTMFeatureExtractor):
"""LSTM Feature Extractor using MobilenetV1 features."""
def __init__(self,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=True,
override_base_feature_extractor_hyperparams=False,
lstm_state_depth=256):
"""Initializes instance of MobileNetV1 Feature Extractor for LSTM Models.
Args:
is_training: A boolean whether the network is in training mode.
depth_multiplier: A float depth multiplier for feature extractor.
min_depth: A number representing minimum feature extractor depth.
pad_to_multiple: The nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False.
use_depthwise: Whether to use depthwise convolutions. Default is True.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
      lstm_state_depth: An integer for the depth of the lstm state.
"""
super(LSTMMobileNetV1FeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams)
self._feature_map_layout = {
'from_layer': ['Conv2d_13_pointwise_lstm', '', '', '', ''],
'layer_depth': [-1, 512, 256, 256, 128],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
self._base_network_scope = 'MobilenetV1'
self._lstm_state_depth = lstm_state_depth
def extract_features(self,
preprocessed_inputs,
state_saver=None,
state_name='lstm_state',
unroll_length=5,
scope=None):
"""Extracts features from preprocessed inputs.
The features include the base network features, lstm features and SSD
    features, organized in the following name scopes:
<parent scope>/MobilenetV1/...
<parent scope>/LSTM/...
<parent scope>/FeatureMaps/...
Args:
preprocessed_inputs: A [batch, height, width, channels] float tensor
representing a batch of consecutive frames from video clips.
state_saver: A state saver object with methods `state` and `save_state`.
state_name: A python string for the name to use with the state_saver.
unroll_length: The number of steps to unroll the lstm.
scope: The scope for the base network of the feature extractor.
Returns:
A list of tensors where the ith tensor has shape [batch, height_i,
width_i, depth_i]
"""
preprocessed_inputs = shape_utils.check_min_image_dim(
33, preprocessed_inputs)
with slim.arg_scope(
mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
with (slim.arg_scope(self._conv_hyperparams_fn())
if self._override_base_feature_extractor_hyperparams else
context_manager.IdentityContextManager()):
with slim.arg_scope([slim.batch_norm], fused=False):
# Base network.
with tf.variable_scope(
scope, self._base_network_scope,
reuse=self._reuse_weights) as scope:
net, image_features = mobilenet_v1.mobilenet_v1_base(
ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
final_endpoint='Conv2d_13_pointwise',
min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier,
scope=scope)
with slim.arg_scope(self._conv_hyperparams_fn()):
with slim.arg_scope(
[slim.batch_norm], fused=False, is_training=self._is_training):
# ConvLSTM layers.
with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
lstm_cell = lstm_cells.BottleneckConvLSTMCell(
filter_size=(3, 3),
output_size=(net.shape[1].value, net.shape[2].value),
num_units=max(self._min_depth, self._lstm_state_depth),
activation=tf.nn.relu6,
visualize_gates=True)
net_seq = list(tf.split(net, unroll_length))
if state_saver is None:
init_state = lstm_cell.init_state(
state_name, net.shape[0].value / unroll_length, tf.float32)
else:
c = state_saver.state('%s_c' % state_name)
h = state_saver.state('%s_h' % state_name)
init_state = (c, h)
          # Identities added for inputting state tensors externally.
c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
init_state = (c_ident, h_ident)
net_seq, states_out = rnn_decoder.rnn_decoder(
net_seq, init_state, lstm_cell, scope=lstm_scope)
batcher_ops = None
self._states_out = states_out
if state_saver is not None:
self._step = state_saver.state('%s_step' % state_name)
batcher_ops = [
state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
state_saver.save_state('%s_step' % state_name, self._step - 1)
]
with tf_ops.control_dependencies(batcher_ops):
image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)
# Identities added for reading output states, to be reused externally.
tf.identity(states_out[-1][0], name='lstm_state_out_c')
tf.identity(states_out[-1][1], name='lstm_state_out_h')
# SSD layers.
with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=self._feature_map_layout,
depth_multiplier=(self._depth_multiplier),
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
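# Example usage (an illustrative sketch): running the extractor on a single
# clip of five consecutive frames. The input batch dimension must equal
# num_clips * unroll_length, since the tensor is split into `unroll_length`
# LSTM steps along axis 0. `conv_hyperparams_fn` stands in for an arg_scope
# building function (e.g. from hyperparams_builder); the shapes are assumed
# values for illustration only.
#
#   extractor = LSTMMobileNetV1FeatureExtractor(
#       is_training=True,
#       depth_multiplier=1.0,
#       min_depth=16,
#       pad_to_multiple=1,
#       conv_hyperparams=conv_hyperparams_fn)
#   frames = tf.random_uniform([5, 256, 256, 3])  # One clip of 5 frames.
#   feature_maps = extractor.extract_features(
#       extractor.preprocess(frames), unroll_length=5)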
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for models.lstm_ssd_mobilenet_v1_feature_extractor."""
import numpy as np
import tensorflow as tf
from lstm_object_detection.models import lstm_ssd_mobilenet_v1_feature_extractor as feature_extractor
from google3.third_party.tensorflow_models.object_detection.models import ssd_feature_extractor_test
slim = tf.contrib.slim
class LstmSsdMobilenetV1FeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
def _create_feature_extractor(self,
depth_multiplier=1.0,
pad_to_multiple=1,
is_training=True,
use_explicit_padding=False):
"""Constructs a new feature extractor.
Args:
depth_multiplier: A float depth multiplier for feature extractor.
pad_to_multiple: The nearest multiple to zero pad the input height and
width dimensions to.
is_training: A boolean whether the network is in training mode.
use_explicit_padding: A boolean whether to use explicit padding.
Returns:
      An LSTMMobileNetV1FeatureExtractor object.
"""
min_depth = 32
extractor = (
        feature_extractor.LSTMMobileNetV1FeatureExtractor(
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
self.conv_hyperparams_fn,
use_explicit_padding=use_explicit_padding))
extractor.lstm_state_depth = int(256 * depth_multiplier)
return extractor
def test_extract_features_returns_correct_shapes_256(self):
image_height = 256
image_width = 256
depth_multiplier = 1.0
pad_to_multiple = 1
batch_size = 5
    expected_feature_map_shape = [(batch_size, 8, 8, 256),
                                  (batch_size, 4, 4, 512),
                                  (batch_size, 2, 2, 256),
                                  (batch_size, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
batch_size,
image_height,
image_width,
depth_multiplier,
pad_to_multiple,
expected_feature_map_shape,
use_explicit_padding=False)
self.check_extract_features_returns_correct_shape(
batch_size,
image_height,
image_width,
depth_multiplier,
pad_to_multiple,
expected_feature_map_shape,
use_explicit_padding=True)
def test_preprocess_returns_correct_value_range(self):
test_image = np.random.rand(5, 128, 128, 3)
feature_extractor = self._create_feature_extractor()
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
def test_variables_only_created_in_scope(self):
scope_name = 'MobilenetV1'
g = tf.Graph()
with g.as_default():
preprocessed_inputs = tf.placeholder(tf.float32, (5, 256, 256, 3))
feature_extractor = self._create_feature_extractor()
feature_extractor.extract_features(preprocessed_inputs)
variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
find_scope = False
for variable in variables:
if scope_name in variable.name:
find_scope = True
break
self.assertTrue(find_scope)
def test_lstm_non_zero_state(self):
init_state = {
'lstm_state_c': tf.zeros([8, 8, 256]),
'lstm_state_h': tf.zeros([8, 8, 256]),
'lstm_state_step': tf.zeros([1])
}
seq = {'test': tf.random_uniform([3, 1, 1, 1])}
stateful_reader = tf.contrib.training.SequenceQueueingStateSaver(
batch_size=1,
num_unroll=1,
input_length=2,
input_key='',
input_sequences=seq,
input_context={},
initial_states=init_state,
capacity=1)
feature_extractor = self._create_feature_extractor()
image = tf.random_uniform([5, 256, 256, 3])
with tf.variable_scope('zero_state'):
feature_map = feature_extractor.extract_features(
image, stateful_reader.next_batch)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run([stateful_reader.prefetch_op])
_ = sess.run([feature_map])
# Update states with the next batch.
state = sess.run(stateful_reader.next_batch.state('lstm_state_c'))
# State should no longer be zero after update.
self.assertTrue(state.any())
if __name__ == '__main__':
tf.test.main()
syntax = "proto2";
package lstm_object_detection.input_readers;
import "third_party/tensorflow_models/object_detection/protos/input_reader.proto";
message GoogleInputReader {
extend object_detection.protos.ExternalInputReader {
optional GoogleInputReader google_input_reader = 444;
}
oneof input_reader {
TFRecordVideoInputReader tf_record_video_input_reader = 1;
}
}
message TFRecordVideoInputReader {
// Path(s) to tfrecords of input data.
repeated string input_path = 1;
enum DataType {
UNSPECIFIED = 0;
ANNOTATED_IMAGE = 1;
TF_EXAMPLE = 2;
TF_SEQUENCE_EXAMPLE = 3;
}
optional DataType data_type = 2 [default=TF_SEQUENCE_EXAMPLE];
  // Length of the video sequence. All input video sequences should have the
  // same length in frames, e.g. 5 frames.
optional int32 video_length = 3;
}
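// Example usage (an illustrative sketch): in a pipeline config, this reader is
// selected through the external_input_reader extension declared above. The
// extension key is derived from the package and message names in this file;
// the path and video_length are placeholders.
//
//   train_input_reader: {
//     external_input_reader {
//       [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
//         tf_record_video_input_reader: {
//           input_path: "/path/to/sequence_example/tfrecords"
//           data_type: TF_SEQUENCE_EXAMPLE
//           video_length: 4
//         }
//       }
//     }
//   }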
syntax = "proto2";
package object_detection.protos;
import "third_party/tensorflow_models/object_detection/protos/pipeline.proto";
extend TrainEvalPipelineConfig {
optional LstmModel lstm_model = 205743444;
}
// Message for extra fields needed for configuring LSTM model.
message LstmModel {
// Unroll length for training LSTMs.
optional int32 train_unroll_length = 1;
// Unroll length for evaluating LSTMs.
optional int32 eval_unroll_length = 2;
// Depth of the lstm feature map.
optional int32 lstm_state_depth = 3 [default = 256];
}
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""tf.data.Dataset builder.
Creates data sources for DetectionModels from an InputReader config. See
input_reader.proto for options.
Note: If users wish to also use their own InputReaders with the Object
Detection configuration framework, they should define their own builder
function that wraps the build function.
"""
import tensorflow as tf
import tensorflow.google as google_tf
from google3.learning.brain.contrib.slim.data import parallel_reader
from tensorflow.contrib.training.python.training import sequence_queueing_state_saver as sqss
from lstm_object_detection import tf_sequence_example_decoder
from lstm_object_detection.protos import input_reader_google_pb2
from google3.third_party.tensorflow_models.object_detection.core import preprocessor
from google3.third_party.tensorflow_models.object_detection.core import preprocessor_cache
from google3.third_party.tensorflow_models.object_detection.core import standard_fields as fields
from google3.third_party.tensorflow_models.object_detection.protos import input_reader_pb2
from google3.third_party.tensorflow_models.object_detection.utils import ops as util_ops
# TODO(yinxiao): Make the following variable configurable in the proto.
# Padding size for the labeled objects in each frame. Here we assume each
# frame has a total number of objects less than _PADDING_SIZE.
_PADDING_SIZE = 30
def _build_training_batch_dict(batch_sequences_with_states, unroll_length,
batch_size):
"""Builds training batch samples.
Args:
batch_sequences_with_states: A batch_sequences_with_states object.
unroll_length: Unrolled length for LSTM training.
batch_size: Batch size for queue outputs.
Returns:
    A dictionary of unrolled image, groundtruth box and groundtruth class
    tensors, plus the original batch_sequences_with_states object under the
    key 'batch'.
"""
seq_tensors_dict = {
fields.InputDataFields.image: [],
fields.InputDataFields.groundtruth_boxes: [],
fields.InputDataFields.groundtruth_classes: [],
'batch': batch_sequences_with_states,
}
for i in range(unroll_length):
for j in range(batch_size):
filtered_dict = util_ops.filter_groundtruth_with_nan_box_coordinates({
fields.InputDataFields.groundtruth_boxes: (
batch_sequences_with_states.sequences['groundtruth_boxes'][j][i]),
fields.InputDataFields.groundtruth_classes: (
batch_sequences_with_states.sequences['groundtruth_classes'][j][i]
),
})
filtered_dict = util_ops.retain_groundtruth_with_positive_classes(
filtered_dict)
seq_tensors_dict[fields.InputDataFields.image].append(
batch_sequences_with_states.sequences['image'][j][i])
seq_tensors_dict[fields.InputDataFields.groundtruth_boxes].append(
filtered_dict[fields.InputDataFields.groundtruth_boxes])
seq_tensors_dict[fields.InputDataFields.groundtruth_classes].append(
filtered_dict[fields.InputDataFields.groundtruth_classes])
seq_tensors_dict[fields.InputDataFields.image] = tuple(
seq_tensors_dict[fields.InputDataFields.image])
seq_tensors_dict[fields.InputDataFields.groundtruth_boxes] = tuple(
seq_tensors_dict[fields.InputDataFields.groundtruth_boxes])
seq_tensors_dict[fields.InputDataFields.groundtruth_classes] = tuple(
seq_tensors_dict[fields.InputDataFields.groundtruth_classes])
return seq_tensors_dict
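# Shape sketch (assumed unroll_length=U and batch_size=B for illustration): the
# returned dictionary has the form
#   {
#     fields.InputDataFields.image: tuple of U * B image tensors,
#     fields.InputDataFields.groundtruth_boxes: tuple of U * B box tensors,
#     fields.InputDataFields.groundtruth_classes: tuple of U * B class tensors,
#     'batch': the batch_sequences_with_states object passed in,
#   }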
def build(input_reader_config,
model_config,
lstm_config,
unroll_length,
data_augmentation_options=None,
batch_size=1):
"""Builds a tensor dictionary based on the InputReader config.
Args:
input_reader_config: An input_reader_builder.InputReader object.
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LSTM specific configs.
unroll_length: Unrolled length for LSTM training.
data_augmentation_options: A list of tuples, where each tuple contains a
data augmentation function and a dictionary containing arguments and their
values (see preprocessor.py).
batch_size: Batch size for queue outputs.
Returns:
A dictionary of tensors based on items in the input_reader_config.
Raises:
ValueError: On invalid input reader proto.
ValueError: If no input paths are specified.
"""
if not isinstance(input_reader_config, input_reader_pb2.InputReader):
raise ValueError('input_reader_config not of type '
'input_reader_pb2.InputReader.')
external_reader_config = input_reader_config.external_input_reader
google_input_reader_config = external_reader_config.Extensions[
input_reader_google_pb2.GoogleInputReader.google_input_reader]
input_reader_type = google_input_reader_config.WhichOneof('input_reader')
if input_reader_type == 'tf_record_video_input_reader':
config = google_input_reader_config.tf_record_video_input_reader
reader_type_class = tf.TFRecordReader
else:
raise ValueError(
'Unsupported reader in input_reader_config: %s' % input_reader_type)
if not config.input_path:
raise ValueError('At least one input path must be specified in '
'`input_reader_config`.')
key, value = parallel_reader.parallel_read(
config.input_path[:], # Convert `RepeatedScalarContainer` to list.
reader_class=reader_type_class,
num_epochs=(input_reader_config.num_epochs
if input_reader_config.num_epochs else None),
num_readers=input_reader_config.num_readers,
shuffle=input_reader_config.shuffle,
dtypes=[tf.string, tf.string],
capacity=input_reader_config.queue_capacity,
min_after_dequeue=input_reader_config.min_after_dequeue)
# TODO(yinxiao): Add loading instance mask option.
decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder()
keys_to_decode = [
fields.InputDataFields.image, fields.InputDataFields.groundtruth_boxes,
fields.InputDataFields.groundtruth_classes
]
tensor_dict = decoder.decode(value, items=keys_to_decode)
tensor_dict['image'].set_shape([None, None, None, 3])
tensor_dict['groundtruth_boxes'].set_shape([None, None, 4])
height = model_config.ssd.image_resizer.fixed_shape_resizer.height
width = model_config.ssd.image_resizer.fixed_shape_resizer.width
# If data augmentation is specified in the config file, the preprocessor
# will be called here to augment the data as specified. Most common
# augmentations include horizontal flip and cropping.
if data_augmentation_options:
images_pre = tf.split(tensor_dict['image'], config.video_length, axis=0)
bboxes_pre = tf.split(
tensor_dict['groundtruth_boxes'], config.video_length, axis=0)
labels_pre = tf.split(
tensor_dict['groundtruth_classes'], config.video_length, axis=0)
images_proc, bboxes_proc, labels_proc = [], [], []
cache = preprocessor_cache.PreprocessorCache()
for i, _ in enumerate(images_pre):
image_dict = {
fields.InputDataFields.image:
images_pre[i],
fields.InputDataFields.groundtruth_boxes:
tf.squeeze(bboxes_pre[i], axis=0),
fields.InputDataFields.groundtruth_classes:
tf.squeeze(labels_pre[i], axis=0),
}
image_dict = preprocessor.preprocess(
image_dict,
data_augmentation_options,
func_arg_map=preprocessor.get_default_func_arg_map(),
preprocess_vars_cache=cache)
# Pads detection count to _PADDING_SIZE.
image_dict[fields.InputDataFields.groundtruth_boxes] = tf.pad(
image_dict[fields.InputDataFields.groundtruth_boxes],
[[0, _PADDING_SIZE], [0, 0]])
image_dict[fields.InputDataFields.groundtruth_boxes] = tf.slice(
image_dict[fields.InputDataFields.groundtruth_boxes], [0, 0],
[_PADDING_SIZE, -1])
image_dict[fields.InputDataFields.groundtruth_classes] = tf.pad(
image_dict[fields.InputDataFields.groundtruth_classes],
[[0, _PADDING_SIZE]])
image_dict[fields.InputDataFields.groundtruth_classes] = tf.slice(
image_dict[fields.InputDataFields.groundtruth_classes], [0],
[_PADDING_SIZE])
images_proc.append(image_dict[fields.InputDataFields.image])
bboxes_proc.append(image_dict[fields.InputDataFields.groundtruth_boxes])
labels_proc.append(image_dict[fields.InputDataFields.groundtruth_classes])
tensor_dict['image'] = tf.concat(images_proc, axis=0)
tensor_dict['groundtruth_boxes'] = tf.stack(bboxes_proc, axis=0)
tensor_dict['groundtruth_classes'] = tf.stack(labels_proc, axis=0)
else:
# Pads detection count to _PADDING_SIZE per frame.
tensor_dict['groundtruth_boxes'] = tf.pad(
tensor_dict['groundtruth_boxes'], [[0, 0], [0, _PADDING_SIZE], [0, 0]])
tensor_dict['groundtruth_boxes'] = tf.slice(
tensor_dict['groundtruth_boxes'], [0, 0, 0], [-1, _PADDING_SIZE, -1])
tensor_dict['groundtruth_classes'] = tf.pad(
tensor_dict['groundtruth_classes'], [[0, 0], [0, _PADDING_SIZE]])
tensor_dict['groundtruth_classes'] = tf.slice(
tensor_dict['groundtruth_classes'], [0, 0], [-1, _PADDING_SIZE])
tensor_dict['image'], _ = preprocessor.resize_image(
tensor_dict['image'], new_height=height, new_width=width)
num_steps = config.video_length / unroll_length
init_states = {
'lstm_state_c':
tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
'lstm_state_h':
tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
'lstm_state_step':
tf.constant(num_steps, shape=[]),
}
batch = sqss.batch_sequences_with_states(
input_key=key,
input_sequences=tensor_dict,
input_context={},
input_length=None,
initial_states=init_states,
num_unroll=unroll_length,
batch_size=batch_size,
num_threads=batch_size,
make_keys_unique=True,
capacity=batch_size * batch_size)
return _build_training_batch_dict(batch, unroll_length, batch_size)
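# Example usage (an illustrative sketch): assembling the training input
# pipeline from a parsed TrainEvalPipelineConfig. `preprocessor_builder` is the
# standard Object Detection builder for data augmentation steps; the config
# objects below are assumed to come from the same parsed pipeline as in
# model_builder.
#
#   from google3.third_party.tensorflow_models.object_detection.builders import preprocessor_builder
#
#   train_config = pipeline_config.train_config
#   train_input_config = pipeline_config.train_input_reader
#   lstm_config = pipeline_config.Extensions[internal_pipeline_pb2.lstm_model]
#   data_augmentation_options = [
#       preprocessor_builder.build(step)
#       for step in train_config.data_augmentation_options]
#   tensor_dict = build(
#       input_reader_config=train_input_config,
#       model_config=pipeline_config.model,
#       lstm_config=lstm_config,
#       unroll_length=lstm_config.train_unroll_length,
#       data_augmentation_options=data_augmentation_options,
#       batch_size=train_config.batch_size)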