"examples/vscode:/vscode.git/clone" did not exist on "dfd7eafbced75029555172515cb92daed48dd21a"
Commit 7c732da7 authored by Nimit Nigania

Merge remote-tracking branch 'upstream/master'

parents cb8ce606 e36934b3
@@ -152,7 +152,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self._setup()
     self.num_gpus = 1
     FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad')
-    FLAGS.train_batch_size = 4
+    FLAGS.train_batch_size = 3
     self._run_and_report_benchmark()
@@ -174,7 +174,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self._setup()
     self.num_gpus = 1
     FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat_squad')
-    FLAGS.train_batch_size = 4
+    FLAGS.train_batch_size = 3
     self._run_and_report_benchmark(use_ds=False)
@@ -185,7 +185,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self.num_gpus = 1
     FLAGS.model_dir = self._get_model_dir(
         'benchmark_1_gpu_eager_no_dist_strat_squad')
-    FLAGS.train_batch_size = 4
+    FLAGS.train_batch_size = 3
     self._run_and_report_benchmark(use_ds=False, run_eagerly=True)
@@ -195,7 +195,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self._setup()
     self.num_gpus = 2
     FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu_squad')
-    FLAGS.train_batch_size = 8
+    FLAGS.train_batch_size = 6
     self._run_and_report_benchmark()
@@ -205,7 +205,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self._setup()
     self.num_gpus = 4
     FLAGS.model_dir = self._get_model_dir('benchmark_4_gpu_squad')
-    FLAGS.train_batch_size = 16
+    FLAGS.train_batch_size = 12
     self._run_and_report_benchmark()
@@ -215,7 +215,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self._setup()
     self.num_gpus = 8
     FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
-    FLAGS.train_batch_size = 32
+    FLAGS.train_batch_size = 24
     self._run_and_report_benchmark()
@@ -231,6 +231,19 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
     self._run_and_report_benchmark()

+  def benchmark_1_gpu_xla_fp16(self):
+    """Tests BERT SQuAD model performance with 1 GPU with XLA and FP16."""
+    self._setup()
+    self.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla_squad_fp16')
+    FLAGS.train_batch_size = 4
+    FLAGS.enable_xla = True
+    FLAGS.dtype = 'fp16'
+    FLAGS.loss_scale = 'dynamic'
+    self._run_and_report_benchmark()
+
   def benchmark_2_gpu_fp16(self):
     """Tests BERT SQuAD model performance with 2 GPUs and FP16."""
@@ -324,7 +337,7 @@ class BertSquadAccuracy(BertSquadBenchmarkBase):
     self._setup()
     self.num_gpus = 8
     FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
-    FLAGS.train_batch_size = 32
+    FLAGS.train_batch_size = 24
     self._run_and_report_benchmark()
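Note: the new benchmark_1_gpu_xla_fp16 case combines FLAGS.enable_xla, FLAGS.dtype='fp16', and FLAGS.loss_scale='dynamic'. As a hedged sketch (my illustration, not part of this commit), these flags roughly correspond to the following TF 2.x settings; the exact wiring lives in the benchmark base class:

import tensorflow as tf

# Enable XLA JIT compilation (what FLAGS.enable_xla toggles).
tf.config.optimizer.set_jit(True)

# fp16 compute with dynamic loss scaling (experimental API circa TF 2.1).
policy = tf.keras.mixed_precision.experimental.Policy(
    'mixed_float16', loss_scale='dynamic')
tf.keras.mixed_precision.experimental.set_policy(policy)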
...
@@ -276,6 +276,7 @@ class EmbeddingPostprocessor(tf.keras.layers.Layer):
                max_position_embeddings=512,
                dropout_prob=0.0,
                initializer_range=0.02,
+               initializer=None,
                **kwargs):
     super(EmbeddingPostprocessor, self).__init__(**kwargs)
     self.use_type_embeddings = use_type_embeddings
@@ -285,6 +286,11 @@ class EmbeddingPostprocessor(tf.keras.layers.Layer):
     self.dropout_prob = dropout_prob
     self.initializer_range = initializer_range
+    if not initializer:
+      self.initializer = get_initializer(self.initializer_range)
+    else:
+      self.initializer = initializer
     if self.use_type_embeddings and not self.token_type_vocab_size:
       raise ValueError("If `use_type_embeddings` is True, then "
                        "`token_type_vocab_size` must be specified.")
@@ -723,6 +729,15 @@ class TransformerBlock(tf.keras.layers.Layer):
         name="output_layer_norm", axis=-1, epsilon=1e-12)
     super(TransformerBlock, self).build(unused_input_shapes)

+  def common_layers(self):
+    """Explicitly gets all layer objects inside a Transformer encoder block."""
+    return [
+        self.attention_layer, self.attention_output_dense,
+        self.attention_dropout, self.attention_layer_norm,
+        self.intermediate_dense, self.output_dense, self.output_dropout,
+        self.output_layer_norm
+    ]
+
   def __call__(self, input_tensor, attention_mask=None):
     inputs = pack_inputs([input_tensor, attention_mask])
     return super(TransformerBlock, self).__call__(inputs)
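A self-contained toy analog (mine, with made-up layer names) of the common_layers() accessor pattern added here, useful when callers need to enumerate a block's sub-layers, e.g. to share or audit weights:

import tensorflow as tf

class TinyBlock(tf.keras.layers.Layer):
  """Toy stand-in for TransformerBlock's new common_layers() accessor."""

  def build(self, input_shape):
    self.dense_a = tf.keras.layers.Dense(4)
    self.dense_b = tf.keras.layers.Dense(4)
    super(TinyBlock, self).build(input_shape)

  def common_layers(self):
    # Explicitly enumerate sub-layers so callers can iterate over them.
    return [self.dense_a, self.dense_b]

  def call(self, inputs):
    return self.dense_b(self.dense_a(inputs))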
...
@@ -35,8 +35,8 @@ from official.bert import model_saving_utils
 from official.bert import model_training_utils
 from official.bert import modeling
 from official.bert import optimization
-from official.bert import tpu_lib
 from official.utils.misc import keras_utils
+from official.utils.misc import tpu_lib

 flags.DEFINE_enum(
     'mode', 'train_and_eval', ['train_and_eval', 'export_only'],
...
@@ -33,7 +33,7 @@ from official.bert import model_saving_utils
 from official.bert import model_training_utils
 from official.bert import modeling
 from official.bert import optimization
-from official.bert import tpu_lib
+from official.utils.misc import tpu_lib

 flags.DEFINE_string('input_files', None,
                     'File path to retrieve training data for pre-training.')
...
@@ -36,8 +36,8 @@ from official.bert import modeling
 from official.bert import optimization
 from official.bert import squad_lib
 from official.bert import tokenization
-from official.bert import tpu_lib
 from official.utils.misc import keras_utils
+from official.utils.misc import tpu_lib

 flags.DEFINE_bool('do_train', False, 'Whether to run training.')
 flags.DEFINE_bool('do_predict', False, 'Whether to run eval on the dev set.')
...
@@ -143,37 +143,32 @@ class DatasetManager(object):
     if is_training:
       return {
           movielens.USER_COLUMN:
-              tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
+              tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
           movielens.ITEM_COLUMN:
-              tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
+              tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
           rconst.VALID_POINT_MASK:
-              tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
+              tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
           "labels":
-              tf.io.FixedLenFeature([batch_size], dtype=tf.int64)
+              tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64)
       }
     else:
       return {
           movielens.USER_COLUMN:
-              tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
+              tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
           movielens.ITEM_COLUMN:
-              tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
+              tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
           rconst.DUPLICATE_MASK:
-              tf.io.FixedLenFeature([batch_size], dtype=tf.int64)
+              tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64)
       }
     features = tf.io.parse_single_example(
         serialized_data, _get_feature_map(batch_size, is_training=is_training))
-    users = tf.reshape(
-        tf.cast(features[movielens.USER_COLUMN], rconst.USER_DTYPE),
-        (batch_size,))
-    items = tf.reshape(
-        tf.cast(features[movielens.ITEM_COLUMN], rconst.ITEM_DTYPE),
-        (batch_size,))
+    users = tf.cast(features[movielens.USER_COLUMN], rconst.USER_DTYPE)
+    items = tf.cast(features[movielens.ITEM_COLUMN], rconst.ITEM_DTYPE)

     if is_training:
-      valid_point_mask = tf.reshape(
-          tf.cast(features[movielens.ITEM_COLUMN], tf.bool), (batch_size,))
-      fake_dup_mask = tf.zeros_like(features[movielens.USER_COLUMN])
+      valid_point_mask = tf.cast(features[rconst.VALID_POINT_MASK], tf.bool)
+      fake_dup_mask = tf.zeros_like(users)
       return {
           movielens.USER_COLUMN: users,
           movielens.ITEM_COLUMN: items,
@@ -184,20 +179,15 @@ class DatasetManager(object):
           rconst.DUPLICATE_MASK: fake_dup_mask
       }
     else:
-      labels = tf.reshape(
-          tf.cast(tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool),
-          (batch_size, 1))
-      fake_valid_pt_mask = tf.cast(
-          tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
+      labels = tf.cast(tf.zeros_like(users), tf.bool)
+      fake_valid_pt_mask = tf.cast(tf.zeros_like(users), tf.bool)
       return {
           movielens.USER_COLUMN:
               users,
           movielens.ITEM_COLUMN:
               items,
           rconst.DUPLICATE_MASK:
-              tf.reshape(
-                  tf.cast(features[rconst.DUPLICATE_MASK], tf.bool),
-                  (batch_size,)),
+              tf.cast(features[rconst.DUPLICATE_MASK], tf.bool),
           rconst.VALID_POINT_MASK:
               fake_valid_pt_mask,
           rconst.TRAIN_LABEL_KEY:
@@ -221,8 +211,8 @@ class DatasetManager(object):
     if self._is_training:
       mask_start_index = data.pop(rconst.MASK_START_INDEX)
       batch_size = data[movielens.ITEM_COLUMN].shape[0]
-      data[rconst.VALID_POINT_MASK] = np.less(
-          np.arange(batch_size), mask_start_index)
+      data[rconst.VALID_POINT_MASK] = np.expand_dims(
+          np.less(np.arange(batch_size), mask_start_index), -1)

     if self._stream_files:
       example_bytes = self.serialize(data)
@@ -313,19 +303,21 @@ class DatasetManager(object):
     else:
       types = {movielens.USER_COLUMN: rconst.USER_DTYPE,
                movielens.ITEM_COLUMN: rconst.ITEM_DTYPE}
-      shapes = {movielens.USER_COLUMN: tf.TensorShape([batch_size]),
-                movielens.ITEM_COLUMN: tf.TensorShape([batch_size])}
+      shapes = {
+          movielens.USER_COLUMN: tf.TensorShape([batch_size, 1]),
+          movielens.ITEM_COLUMN: tf.TensorShape([batch_size, 1])
+      }

       if self._is_training:
         types[rconst.VALID_POINT_MASK] = np.bool
-        shapes[rconst.VALID_POINT_MASK] = tf.TensorShape([batch_size])
+        shapes[rconst.VALID_POINT_MASK] = tf.TensorShape([batch_size, 1])
         types = (types, np.bool)
-        shapes = (shapes, tf.TensorShape([batch_size]))
+        shapes = (shapes, tf.TensorShape([batch_size, 1]))
       else:
         types[rconst.DUPLICATE_MASK] = np.bool
-        shapes[rconst.DUPLICATE_MASK] = tf.TensorShape([batch_size])
+        shapes[rconst.DUPLICATE_MASK] = tf.TensorShape([batch_size, 1])

       data_generator = functools.partial(
           self.data_generator, epochs_between_evals=epochs_between_evals)
@@ -554,12 +546,17 @@ class BaseDataConstructor(threading.Thread):
       items = np.concatenate([items, item_pad])
       labels = np.concatenate([labels, label_pad])

-    self._train_dataset.put(i, {
-        movielens.USER_COLUMN: users,
-        movielens.ITEM_COLUMN: items,
-        rconst.MASK_START_INDEX: np.array(mask_start_index, dtype=np.int32),
-        "labels": labels,
-    })
+    self._train_dataset.put(
+        i, {
+            movielens.USER_COLUMN:
+                np.reshape(users, (self.train_batch_size, 1)),
+            movielens.ITEM_COLUMN:
+                np.reshape(items, (self.train_batch_size, 1)),
+            rconst.MASK_START_INDEX:
+                np.array(mask_start_index, dtype=np.int32),
+            "labels":
+                np.reshape(labels, (self.train_batch_size, 1)),
+        })

   def _wait_to_construct_train_epoch(self):
     count = 0
@@ -649,11 +646,15 @@ class BaseDataConstructor(threading.Thread):
       users, items, duplicate_mask = self._assemble_eval_batch(
           users, positive_items, negative_items, self._eval_users_per_batch)

-      self._eval_dataset.put(i, {
-          movielens.USER_COLUMN: users.flatten(),
-          movielens.ITEM_COLUMN: items.flatten(),
-          rconst.DUPLICATE_MASK: duplicate_mask.flatten(),
-      })
+      self._eval_dataset.put(
+          i, {
+              movielens.USER_COLUMN:
+                  np.reshape(users.flatten(), (self.eval_batch_size, 1)),
+              movielens.ITEM_COLUMN:
+                  np.reshape(items.flatten(), (self.eval_batch_size, 1)),
+              rconst.DUPLICATE_MASK:
+                  np.reshape(duplicate_mask.flatten(),
+                             (self.eval_batch_size, 1)),
+          })

   def _construct_eval_epoch(self):
     """Loop to construct data for evaluation."""
@@ -720,24 +721,37 @@ class DummyConstructor(threading.Thread):
       num_users = params["num_users"]
       num_items = params["num_items"]

-      users = tf.random.uniform([batch_size], dtype=tf.int32, minval=0,
-                                maxval=num_users)
-      items = tf.random.uniform([batch_size], dtype=tf.int32, minval=0,
-                                maxval=num_items)
+      users = tf.random.uniform([batch_size, 1],
+                                dtype=tf.int32,
+                                minval=0,
+                                maxval=num_users)
+      items = tf.random.uniform([batch_size, 1],
+                                dtype=tf.int32,
+                                minval=0,
+                                maxval=num_items)

       if is_training:
-        valid_point_mask = tf.cast(tf.random.uniform(
-            [batch_size], dtype=tf.int32, minval=0, maxval=2), tf.bool)
-        labels = tf.cast(tf.random.uniform(
-            [batch_size], dtype=tf.int32, minval=0, maxval=2), tf.bool)
+        valid_point_mask = tf.cast(
+            tf.random.uniform([batch_size, 1],
+                              dtype=tf.int32,
+                              minval=0,
+                              maxval=2), tf.bool)
+        labels = tf.cast(
+            tf.random.uniform([batch_size, 1],
+                              dtype=tf.int32,
+                              minval=0,
+                              maxval=2), tf.bool)
         data = {
             movielens.USER_COLUMN: users,
             movielens.ITEM_COLUMN: items,
             rconst.VALID_POINT_MASK: valid_point_mask,
         }, labels
       else:
-        dupe_mask = tf.cast(tf.random.uniform([batch_size], dtype=tf.int32,
-                                              minval=0, maxval=2), tf.bool)
+        dupe_mask = tf.cast(
+            tf.random.uniform([batch_size, 1],
+                              dtype=tf.int32,
+                              minval=0,
+                              maxval=2), tf.bool)
         data = {
             movielens.USER_COLUMN: users,
             movielens.ITEM_COLUMN: items,
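The common thread in this file is a shape migration: every per-example feature moves from rank-1 [batch_size] to rank-2 [batch_size, 1]. A minimal illustration (mine, not from the diff):

import numpy as np

batch_size = 8
users = np.arange(batch_size)                  # old convention: shape (8,)
users_2d = np.reshape(users, (batch_size, 1))  # new convention: shape (8, 1)
assert users_2d.shape == (batch_size, 1)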
...
@@ -168,8 +168,11 @@ class BaseTest(tf.test.TestCase):
     md5 = hashlib.md5()
     for features, labels in first_epoch:
       data_list = [
-          features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
-          features[rconst.VALID_POINT_MASK], labels]
+          features[movielens.USER_COLUMN].flatten(),
+          features[movielens.ITEM_COLUMN].flatten(),
+          features[rconst.VALID_POINT_MASK].flatten(),
+          labels.flatten()
+      ]
       for i in data_list:
         md5.update(i.tobytes())
@@ -216,8 +219,10 @@ class BaseTest(tf.test.TestCase):
     md5 = hashlib.md5()
     for features in eval_data:
       data_list = [
-          features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
-          features[rconst.DUPLICATE_MASK]]
+          features[movielens.USER_COLUMN].flatten(),
+          features[movielens.ITEM_COLUMN].flatten(),
+          features[rconst.DUPLICATE_MASK].flatten()
+      ]
       for i in data_list:
         md5.update(i.tobytes())
@@ -276,8 +281,11 @@ class BaseTest(tf.test.TestCase):
     md5 = hashlib.md5()
     for features, labels in results:
       data_list = [
-          features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
-          features[rconst.VALID_POINT_MASK], labels]
+          features[movielens.USER_COLUMN].flatten(),
+          features[movielens.ITEM_COLUMN].flatten(),
+          features[rconst.VALID_POINT_MASK].flatten(),
+          labels.flatten()
+      ]
       for i in data_list:
         md5.update(i.tobytes())
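These tests all use the same determinism idiom: flatten each feature array and fold its raw bytes into one MD5 digest, so two pipeline runs can be compared bit-for-bit. A condensed sketch of that idiom (my extraction, hypothetical function name):

import hashlib
import numpy as np

def fingerprint(arrays):
  md5 = hashlib.md5()
  for a in arrays:
    # Flattening first makes the digest independent of the (n,) vs (n, 1)
    # layout change exercised by this commit.
    md5.update(np.ascontiguousarray(a).flatten().tobytes())
  return md5.hexdigest()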
...
@@ -37,7 +37,6 @@ from official.utils.flags import core as flags_core
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils

 FLAGS = flags.FLAGS
@@ -60,13 +59,8 @@ def get_inputs(params):
       dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
       constructor_type=FLAGS.constructor_type,
       deterministic=FLAGS.seed is not None)
-  num_train_steps = (producer.train_batches_per_epoch //
-                     params["batches_per_step"])
-  num_eval_steps = (producer.eval_batches_per_epoch //
-                    params["batches_per_step"])
-  assert not producer.train_batches_per_epoch % params["batches_per_step"]
-  assert not producer.eval_batches_per_epoch % params["batches_per_step"]
+  num_train_steps = producer.train_batches_per_epoch
+  num_eval_steps = producer.eval_batches_per_epoch

   return num_users, num_items, num_train_steps, num_eval_steps, producer
@@ -74,18 +68,13 @@ def get_inputs(params):
 def parse_flags(flags_obj):
   """Convenience function to turn flags into params."""
   num_gpus = flags_core.get_num_gpus(flags_obj)
-  num_devices = FLAGS.num_tpu_shards if FLAGS.tpu else num_gpus or 1
-  batch_size = (flags_obj.batch_size + num_devices - 1) // num_devices

-  eval_divisor = (rconst.NUM_EVAL_NEGATIVES + 1) * num_devices
+  batch_size = flags_obj.batch_size
   eval_batch_size = flags_obj.eval_batch_size or flags_obj.batch_size
-  eval_batch_size = ((eval_batch_size + eval_divisor - 1) //
-                     eval_divisor * eval_divisor // num_devices)

   return {
       "train_epochs": flags_obj.train_epochs,
-      "batches_per_step": num_devices,
+      "batches_per_step": 1,
       "use_seed": flags_obj.seed is not None,
       "batch_size": batch_size,
       "eval_batch_size": eval_batch_size,
@@ -95,6 +84,7 @@ def parse_flags(flags_obj):
       "mf_regularization": flags_obj.mf_regularization,
       "mlp_reg_layers": [float(reg) for reg in flags_obj.mlp_regularization],
       "num_neg": flags_obj.num_neg,
+      "distribution_strategy": flags_obj.distribution_strategy,
       "num_gpus": num_gpus,
       "use_tpu": flags_obj.tpu is not None,
       "tpu": flags_obj.tpu,
@@ -115,7 +105,7 @@ def parse_flags(flags_obj):
   }


-def get_distribution_strategy(params):
+def get_v1_distribution_strategy(params):
   """Returns the distribution strategy to use."""
   if params["use_tpu"]:
     # Some of the networking libraries are quite chatty.
...
@@ -66,7 +66,7 @@ def construct_estimator(model_dir, params):
   Returns:
     An Estimator or TPUEstimator.
   """
-  distribution = ncf_common.get_distribution_strategy(params)
+  distribution = ncf_common.get_v1_distribution_strategy(params)
   run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                       eval_distribute=distribution)
@@ -82,7 +82,6 @@ def create_dataset_from_data_producer(producer, params):
     Returns:
       Processed training features.
     """
-    labels = tf.expand_dims(labels, -1)
     fake_dup_mask = tf.zeros_like(features[movielens.USER_COLUMN])
     features[rconst.DUPLICATE_MASK] = fake_dup_mask
     features[rconst.TRAIN_LABEL_KEY] = labels
@@ -106,7 +105,6 @@ def create_dataset_from_data_producer(producer, params):
     Returns:
       Processed evaluation features.
     """
     labels = tf.cast(tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
-    labels = tf.expand_dims(labels, -1)
     fake_valid_pt_mask = tf.cast(
         tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
     features[rconst.VALID_POINT_MASK] = fake_valid_pt_mask
@@ -134,9 +132,13 @@ def create_ncf_input_data(params, producer=None, input_meta_data=None):
   Returns:
     (training dataset, evaluation dataset, train steps per epoch,
     eval steps per epoch)
+
+  Raises:
+    ValueError: If data is being generated online when using TPUs.
   """
   if params["train_dataset_path"]:
+    assert params["eval_dataset_path"]
     train_dataset = create_dataset_from_tf_record_files(
         params["train_dataset_path"],
         input_meta_data["train_prebatch_size"],
@@ -148,34 +150,18 @@ def create_ncf_input_data(params, producer=None, input_meta_data=None):
         params["eval_batch_size"],
         is_training=False)
-    # TODO(b/259377621): Remove number of devices (i.e.
-    # params["batches_per_step"]) in input pipeline logic and only use
-    # global batch size instead.
-    num_train_steps = int(
-        np.ceil(input_meta_data["num_train_steps"] /
-                params["batches_per_step"]))
-    num_eval_steps = (
-        input_meta_data["num_eval_steps"] // params["batches_per_step"])
+    num_train_steps = int(input_meta_data["num_train_steps"])
+    num_eval_steps = int(input_meta_data["num_eval_steps"])
   else:
-    assert producer
+    if params["use_tpu"]:
+      raise ValueError("TPU training does not support data producer yet. "
+                       "Use pre-processed data.")
+
+    assert producer
     # Start retrieving data from producer.
     train_dataset, eval_dataset = create_dataset_from_data_producer(
         producer, params)
-    num_train_steps = (
-        producer.train_batches_per_epoch // params["batches_per_step"])
-    num_eval_steps = (
-        producer.eval_batches_per_epoch // params["batches_per_step"])
-    assert not producer.train_batches_per_epoch % params["batches_per_step"]
-    assert not producer.eval_batches_per_epoch % params["batches_per_step"]
-
-  # It is required that for distributed training, the dataset must call
-  # batch(). The parameter of batch() here is the number of replicas involved,
-  # such that each replica evenly gets a slice of data.
-  # drop_remainder = True, as we would like batch call to return a fixed shape
-  # vs None; this prevents an expensive broadcast during weighted_loss.
-  batches_per_step = params["batches_per_step"]
-  train_dataset = train_dataset.batch(batches_per_step, drop_remainder=True)
-  eval_dataset = eval_dataset.batch(batches_per_step, drop_remainder=True)
+    num_train_steps = producer.train_batches_per_epoch
+    num_eval_steps = producer.eval_batches_per_epoch

   return train_dataset, eval_dataset, num_train_steps, num_eval_steps
...
@@ -189,7 +189,7 @@ class NcfTest(tf.test.TestCase):
     self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
                                   2 * math.log(2) / math.log(4)) / 4)

-  _BASE_END_TO_END_FLAGS = ['-batch_size', '1024', '-train_epochs', '1']
+  _BASE_END_TO_END_FLAGS = ['-batch_size', '1044', '-train_epochs', '1']

   @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
   @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
...
@@ -109,7 +109,6 @@ def neumf_model_fn(features, labels, mode, params):
     mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                             value=params["epsilon"])
     optimizer = tf.compat.v1.train.AdamOptimizer(
         learning_rate=params["learning_rate"],
         beta1=params["beta1"],
@@ -151,7 +150,7 @@ def _strip_first_and_last_dimension(x, batch_size):
   return tf.reshape(x[0, :], (batch_size,))


-def construct_model(user_input, item_input, params, need_strip=False):
+def construct_model(user_input, item_input, params):
   # type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
   """Initialize NeuMF model.
@@ -184,34 +183,33 @@ def construct_model(user_input, item_input, params):
   # Initializer for embedding layers
   embedding_initializer = "glorot_uniform"

-  if need_strip:
-    batch_size = params["batch_size"]
+  def mf_slice_fn(x):
+    x = tf.squeeze(x, [1])
+    return x[:, :mf_dim]

-    user_input_reshaped = tf.keras.layers.Lambda(
-        lambda x: _strip_first_and_last_dimension(
-            x, batch_size))(user_input)
-    item_input_reshaped = tf.keras.layers.Lambda(
-        lambda x: _strip_first_and_last_dimension(
-            x, batch_size))(item_input)
+  def mlp_slice_fn(x):
+    x = tf.squeeze(x, [1])
+    return x[:, mf_dim:]

   # It turns out to be significantly more efficient to store the MF and MLP
   # embedding portions in the same table, and then slice as needed.
-  mf_slice_fn = lambda x: x[:, :mf_dim]
-  mlp_slice_fn = lambda x: x[:, mf_dim:]
   embedding_user = tf.keras.layers.Embedding(
-      num_users, mf_dim + model_layers[0] // 2,
+      num_users,
+      mf_dim + model_layers[0] // 2,
       embeddings_initializer=embedding_initializer,
       embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
-      input_length=1, name="embedding_user")(
-          user_input_reshaped if need_strip else user_input)
+      input_length=1,
+      name="embedding_user")(user_input)
   embedding_item = tf.keras.layers.Embedding(
-      num_items, mf_dim + model_layers[0] // 2,
+      num_items,
+      mf_dim + model_layers[0] // 2,
       embeddings_initializer=embedding_initializer,
       embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
-      input_length=1, name="embedding_item")(
-          item_input_reshaped if need_strip else item_input)
+      input_length=1,
+      name="embedding_item")(item_input)

   # GMF part
   mf_user_latent = tf.keras.layers.Lambda(
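A self-contained sketch (assumed dimensions, mine) of the shared-embedding slicing that mf_slice_fn and mlp_slice_fn now perform: one [batch, 1, mf_dim + mlp_dim/2] lookup is squeezed to rank 2 and split into its GMF and MLP halves.

import tensorflow as tf

mf_dim = 8
embedding = tf.zeros([32, 1, mf_dim + 16])  # batch of 32, input_length=1
x = tf.squeeze(embedding, [1])              # -> [32, 24]
mf_part = x[:, :mf_dim]                     # GMF latent: [32, 8]
mlp_part = x[:, mf_dim:]                    # MLP latent: [32, 16]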
...
@@ -24,6 +24,8 @@ import random
 import string

 import tensorflow as tf

+from official.utils.misc import tpu_lib
+

 def _collective_communication(all_reduce_alg):
   """Return a CollectiveCommunication based on all_reduce_alg.
@@ -83,16 +85,18 @@ def get_distribution_strategy(distribution_strategy="default",
                               num_gpus=0,
                               num_workers=1,
                               all_reduce_alg=None,
-                              num_packs=1):
+                              num_packs=1,
+                              tpu_address=None):
   """Return a DistributionStrategy for running the model.

   Args:
     distribution_strategy: a string specifying which distribution strategy to
-      use. Accepted values are 'off', 'default', 'one_device', 'mirrored',
-      'parameter_server', 'multi_worker_mirrored', case insensitive. 'off'
-      means not to use Distribution Strategy; 'default' means to choose from
-      `MirroredStrategy`, `MultiWorkerMirroredStrategy`, or `OneDeviceStrategy`
-      according to the number of GPUs and number of workers.
+      use. Accepted values are 'off', 'default', 'one_device', 'mirrored',
+      'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case
+      insensitive. 'off' means not to use Distribution Strategy; 'default'
+      means to choose from `MirroredStrategy`, `MultiWorkerMirroredStrategy`,
+      or `OneDeviceStrategy` according to the number of GPUs and number of
+      workers. 'tpu' means to use TPUStrategy using `tpu_address`.
     num_gpus: Number of GPUs to run this model.
     num_workers: Number of workers to run this model.
     all_reduce_alg: Optional. Specifies which algorithm to use when performing
@@ -102,12 +106,14 @@ def get_distribution_strategy(distribution_strategy="default",
       device topology.
     num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce`
       or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
+    tpu_address: Optional. String that represents TPU to connect to. Must not
+      be None if `distribution_strategy` is set to `tpu`.

   Returns:
     tf.distribute.DistributionStrategy object.
   Raises:
-    ValueError: if `distribution_strategy` is 'off' or 'one_device' and
-      `num_gpus` is larger than 1; or `num_gpus` is negative.
+    ValueError: if `distribution_strategy` is 'off' or 'one_device' and
+      `num_gpus` is larger than 1; or `num_gpus` is negative or if
+      `distribution_strategy` is `tpu` but `tpu_address` is not specified.
   """
   if num_gpus < 0:
     raise ValueError("`num_gpus` can not be negative.")
@@ -120,6 +126,15 @@ def get_distribution_strategy(distribution_strategy="default",
         "flag cannot be set to 'off'.".format(num_gpus, num_workers))
     return None

+  if distribution_strategy == "tpu":
+    if not tpu_address:
+      raise ValueError("`tpu_address` must be specified when using "
+                       "TPUStrategy.")
+    # Initialize TPU System.
+    cluster_resolver = tpu_lib.tpu_initialize(tpu_address)
+    return tf.distribute.experimental.TPUStrategy(cluster_resolver)
+
   if distribution_strategy == "multi_worker_mirrored":
     return tf.distribute.experimental.MultiWorkerMirroredStrategy(
         communication=_collective_communication(all_reduce_alg))
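A condensed, hedged analog (mine) of the dispatch order the patched helper now follows, with the new 'tpu' branch short-circuiting before the GPU paths; pick_strategy is a hypothetical name:

import tensorflow as tf

def pick_strategy(tpu_address=None, num_gpus=0):
  if tpu_address:
    # Mirrors the new branch: resolve, connect, initialize, wrap.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu_address)
    tf.config.experimental_connect_to_host(resolver.master())
    tf.tpu.experimental.initialize_tpu_system(resolver)
    return tf.distribute.experimental.TPUStrategy(resolver)
  if num_gpus > 1:
    return tf.distribute.MirroredStrategy()
  return tf.distribute.OneDeviceStrategy(
      "device:GPU:0" if num_gpus else "device:CPU:0")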
...
@@ -31,3 +31,8 @@ def tpu_initialize(tpu_address):
   tf.config.experimental_connect_to_host(cluster_resolver.master())
   tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
   return cluster_resolver
+
+
+def get_primary_cpu_task(use_remote_tpu=False):
+  """Returns remote TPU worker address. No-op for GPU/CPU training."""
+  return "/job:worker" if use_remote_tpu else ""
@@ -32,3 +32,8 @@ https://scholar.googleusercontent.com/scholar.bib?q=info:rLqvkztmWYgJ:scholar.go
 * yinxiao@google.com
 * menglong@google.com
 * yongzhe@google.com
+
+## Table of Contents
+
+* <a href='g3doc/exporting_models.md'>Exporting a trained model</a>
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# For training on Imagenet Video with LSTM Interleaved Mobilenet V2
[lstm_object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 4
lstm_state_depth: 320
depth_multipliers: 1.4
depth_multipliers: 0.35
pre_bottleneck: true
low_res: true
train_interleave_method: 'RANDOM_SKIP_SMALL'
eval_interleave_method: 'SKIP3'
}
model {
ssd {
    num_classes: 30  # Number of classes for the ImageNet VID dataset.
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 5
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 3
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 3
box_code_size: 4
apply_sigmoid_to_scores: false
use_depthwise: true
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'lstm_ssd_interleaved_mobilenet_v2'
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 0
}
classification_weight: 1.0
localization_weight: 4.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: -20.0
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 8
optimizer {
use_moving_average: false
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.002
decay_steps: 200000
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
gradient_clipping_by_norm: 10.0
batch_queue_capacity: 12
prefetch_queue_capacity: 4
}
train_input_reader: {
shuffle_buffer_size: 32
queue_capacity: 12
prefetch_size: 12
min_after_dequeue: 4
label_map_path: "path/to/label_map"
external_input_reader {
[lstm_object_detection.protos.GoogleInputReader.google_input_reader] {
tf_record_video_input_reader: {
input_path: '/data/lstm_detection/tfrecords/test.tfrecord'
data_type: TF_SEQUENCE_EXAMPLE
video_length: 4
}
}
}
}
eval_config: {
metrics_set: "coco_evaluation_all_frames"
use_moving_averages: true
min_score_threshold: 0.5
max_num_boxes_to_visualize: 300
visualize_groundtruth_boxes: true
groundtruth_box_visualization_color: "red"
}
eval_input_reader {
label_map_path: "path/to/label_map"
shuffle: true
num_epochs: 1
num_parallel_batches: 1
num_readers: 1
external_input_reader {
[lstm_object_detection.protos.GoogleInputReader.google_input_reader] {
tf_record_video_input_reader: {
input_path: "path/to/sequence_example/data"
data_type: TF_SEQUENCE_EXAMPLE
video_length: 10
}
}
}
}
eval_input_reader: {
label_map_path: "path/to/label_map"
external_input_reader {
[lstm_object_detection.protos.GoogleInputReader.google_input_reader] {
tf_record_video_input_reader: {
input_path: "path/to/sequence_example/data"
data_type: TF_SEQUENCE_EXAMPLE
video_length: 4
}
}
}
shuffle: true
num_readers: 1
}
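For reference, a hedged sketch of how a pipeline config like the one above is consumed, mirroring the export script added later in this commit ('path/to/lstm_pipeline.config' is a placeholder path):

from lstm_object_detection.utils import config_util

configs = config_util.get_configs_from_pipeline_file(
    'path/to/lstm_pipeline.config')
model_config = configs['model']   # keys as used by export_tflite_lstd_graph_lib
lstm_config = configs['lstm_model']
eval_config = configs['eval_config']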
@@ -27,8 +27,6 @@ import functools
 import os

 import tensorflow as tf
 from google.protobuf import text_format
-from google3.pyglib import app
-from google3.pyglib import flags
 from lstm_object_detection import evaluator
 from lstm_object_detection import model_builder
 from lstm_object_detection.inputs import seq_dataset_builder
@@ -107,4 +105,4 @@ def main(unused_argv):
       FLAGS.checkpoint_dir, FLAGS.eval_dir)

 if __name__ == '__main__':
-  app.run()
+  tf.app.run()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Exports an LSTM detection model to use with tf-lite.
Outputs file:
* A tflite compatible frozen graph - $output_directory/tflite_graph.pb
The exported graph has the following input and output nodes.
Inputs:
'input_video_tensor': a float32 tensor of shape
[unroll_length, height, width, 3] containing the normalized input image.
Note that the height and width must be compatible with the height and
width configured in the fixed_shape_image resizer options in the pipeline
config proto.
Outputs:
If add_postprocessing_op is true, the frozen graph adds a
TFLite_Detection_PostProcess custom op node that has four outputs:
detection_boxes: a float32 tensor of shape [1, num_boxes, 4] with box
locations
detection_classes: a float32 tensor of shape [1, num_boxes]
with class indices
detection_scores: a float32 tensor of shape [1, num_boxes]
with class scores
num_boxes: a float32 tensor of size 1 containing the number of detected boxes
else:
the graph has three outputs:
'raw_outputs/box_encodings': a float32 tensor of shape [1, num_anchors, 4]
containing the encoded box predictions.
'raw_outputs/class_predictions': a float32 tensor of shape
[1, num_anchors, num_classes] containing the class scores for each anchor
after applying score conversion.
'anchors': a float32 constant tensor of shape [num_anchors, 4]
containing the anchor boxes.
Example Usage:
--------------
python lstm_object_detection/export_tflite_lstd_graph.py \
--pipeline_config_path path/to/lstm_pipeline.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory
The expected output would be in the directory
path/to/exported_model_directory (which is created if it does not exist)
with contents:
- tflite_graph.pbtxt
- tflite_graph.pb
Config overrides (see the `config_override` flag) are text protobufs
(also of type pipeline_pb2.TrainEvalPipelineConfig) which are used to override
certain fields in the provided pipeline_config_path. These are useful for
making small changes to the inference graph that differ from the training or
eval config.
Example Usage (in which we change the NMS iou_threshold to be 0.5 and
NMS score_threshold to be 0.0):
python lstm_object_detection/export_tflite_lstd_graph.py \
--pipeline_config_path path/to/lstm_pipeline.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory
--config_override " \
model{ \
ssd{ \
post_processing { \
batch_non_max_suppression { \
score_threshold: 0.0 \
iou_threshold: 0.5 \
} \
} \
} \
} \
"
"""
import tensorflow as tf
from lstm_object_detection.utils import config_util
from lstm_object_detection import export_tflite_lstd_graph_lib
flags = tf.app.flags
flags.DEFINE_string('output_directory', None, 'Path to write outputs.')
flags.DEFINE_string(
'pipeline_config_path', None,
'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
'file.')
flags.DEFINE_string('trained_checkpoint_prefix', None, 'Checkpoint prefix.')
flags.DEFINE_integer('max_detections', 10,
'Maximum number of detections (boxes) to show.')
flags.DEFINE_integer('max_classes_per_detection', 1,
'Maximum number of classes to output per detection box.')
flags.DEFINE_integer(
'detections_per_class', 100,
'Number of anchors used per class in Regular Non-Max-Suppression.')
flags.DEFINE_bool('add_postprocessing_op', True,
'Add TFLite custom op for postprocessing to the graph.')
flags.DEFINE_bool(
'use_regular_nms', False,
'Flag to set postprocessing op to use Regular NMS instead of Fast NMS.')
flags.DEFINE_string(
'config_override', '', 'pipeline_pb2.TrainEvalPipelineConfig '
'text proto to override pipeline_config_path.')
FLAGS = flags.FLAGS
def main(argv):
del argv # Unused.
flags.mark_flag_as_required('output_directory')
flags.mark_flag_as_required('pipeline_config_path')
flags.mark_flag_as_required('trained_checkpoint_prefix')
pipeline_config = config_util.get_configs_from_pipeline_file(
FLAGS.pipeline_config_path)
export_tflite_lstd_graph_lib.export_tflite_graph(
pipeline_config, FLAGS.trained_checkpoint_prefix, FLAGS.output_directory,
FLAGS.add_postprocessing_op, FLAGS.max_detections,
FLAGS.max_classes_per_detection, use_regular_nms=FLAGS.use_regular_nms)
if __name__ == '__main__':
tf.app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Exports detection models to use with tf-lite.
See export_tflite_lstd_graph.py for usage.
"""
import os
import tempfile
import numpy as np
import tensorflow as tf
from tensorflow.core.framework import attr_value_pb2
from tensorflow.core.framework import types_pb2
from tensorflow.core.protobuf import saver_pb2
from tensorflow.tools.graph_transforms import TransformGraph
from object_detection import exporter
from object_detection.builders import graph_rewriter_builder
from object_detection.builders import post_processing_builder
from object_detection.core import box_list
from lstm_object_detection import model_builder
_DEFAULT_NUM_CHANNELS = 3
_DEFAULT_NUM_COORD_BOX = 4
def get_const_center_size_encoded_anchors(anchors):
"""Exports center-size encoded anchors as a constant tensor.
Args:
anchors: a float32 tensor of shape [num_anchors, 4] containing the anchor
boxes
Returns:
encoded_anchors: a float32 constant tensor of shape [num_anchors, 4]
containing the anchor boxes.
"""
anchor_boxlist = box_list.BoxList(anchors)
y, x, h, w = anchor_boxlist.get_center_coordinates_and_sizes()
num_anchors = y.get_shape().as_list()
with tf.Session() as sess:
y_out, x_out, h_out, w_out = sess.run([y, x, h, w])
encoded_anchors = tf.constant(
np.transpose(np.stack((y_out, x_out, h_out, w_out))),
dtype=tf.float32,
shape=[num_anchors[0], _DEFAULT_NUM_COORD_BOX],
name='anchors')
return encoded_anchors
def append_postprocessing_op(frozen_graph_def,
max_detections,
max_classes_per_detection,
nms_score_threshold,
nms_iou_threshold,
num_classes,
scale_values,
detections_per_class=100,
use_regular_nms=False):
"""Appends postprocessing custom op.
Args:
frozen_graph_def: Frozen GraphDef for SSD model after freezing the
checkpoint
max_detections: Maximum number of detections (boxes) to show
max_classes_per_detection: Number of classes to display per detection
nms_score_threshold: Score threshold used in Non-maximal suppression in
post-processing
nms_iou_threshold: Intersection-over-union threshold used in Non-maximal
suppression in post-processing
num_classes: number of classes in SSD detector
    scale_values: a dict with the following key-value pairs
      {y_scale: 10, x_scale: 10, h_scale: 5, w_scale: 5} that are used to
      decode center-size boxes
detections_per_class: In regular NonMaxSuppression, number of anchors used
for NonMaxSuppression per class
use_regular_nms: Flag to set postprocessing op to use Regular NMS instead
of Fast NMS.
Returns:
transformed_graph_def: Frozen GraphDef with postprocessing custom op
appended
TFLite_Detection_PostProcess custom op node has four outputs:
detection_boxes: a float32 tensor of shape [1, num_boxes, 4] with box
locations
detection_classes: a float32 tensor of shape [1, num_boxes]
with class indices
detection_scores: a float32 tensor of shape [1, num_boxes]
with class scores
num_boxes: a float32 tensor of size 1 containing the number of detected
boxes
"""
new_output = frozen_graph_def.node.add()
new_output.op = 'TFLite_Detection_PostProcess'
new_output.name = 'TFLite_Detection_PostProcess'
new_output.attr['_output_quantized'].CopyFrom(
attr_value_pb2.AttrValue(b=True))
new_output.attr['_output_types'].list.type.extend([
types_pb2.DT_FLOAT, types_pb2.DT_FLOAT, types_pb2.DT_FLOAT,
types_pb2.DT_FLOAT
])
new_output.attr['_support_output_type_float_in_quantized_op'].CopyFrom(
attr_value_pb2.AttrValue(b=True))
new_output.attr['max_detections'].CopyFrom(
attr_value_pb2.AttrValue(i=max_detections))
new_output.attr['max_classes_per_detection'].CopyFrom(
attr_value_pb2.AttrValue(i=max_classes_per_detection))
new_output.attr['nms_score_threshold'].CopyFrom(
attr_value_pb2.AttrValue(f=nms_score_threshold.pop()))
new_output.attr['nms_iou_threshold'].CopyFrom(
attr_value_pb2.AttrValue(f=nms_iou_threshold.pop()))
new_output.attr['num_classes'].CopyFrom(
attr_value_pb2.AttrValue(i=num_classes))
new_output.attr['y_scale'].CopyFrom(
attr_value_pb2.AttrValue(f=scale_values['y_scale'].pop()))
new_output.attr['x_scale'].CopyFrom(
attr_value_pb2.AttrValue(f=scale_values['x_scale'].pop()))
new_output.attr['h_scale'].CopyFrom(
attr_value_pb2.AttrValue(f=scale_values['h_scale'].pop()))
new_output.attr['w_scale'].CopyFrom(
attr_value_pb2.AttrValue(f=scale_values['w_scale'].pop()))
new_output.attr['detections_per_class'].CopyFrom(
attr_value_pb2.AttrValue(i=detections_per_class))
new_output.attr['use_regular_nms'].CopyFrom(
attr_value_pb2.AttrValue(b=use_regular_nms))
new_output.input.extend(
['raw_outputs/box_encodings', 'raw_outputs/class_predictions', 'anchors'])
# Transform the graph to append new postprocessing op
input_names = []
output_names = ['TFLite_Detection_PostProcess']
transforms = ['strip_unused_nodes']
transformed_graph_def = TransformGraph(frozen_graph_def, input_names,
output_names, transforms)
return transformed_graph_def
def export_tflite_graph(pipeline_config,
trained_checkpoint_prefix,
output_dir,
add_postprocessing_op,
max_detections,
max_classes_per_detection,
detections_per_class=100,
use_regular_nms=False,
binary_graph_name='tflite_graph.pb',
txt_graph_name='tflite_graph.pbtxt'):
"""Exports a tflite compatible graph and anchors for ssd detection model.
Anchors are written to a tensor and tflite compatible graph
is written to output_dir/tflite_graph.pb.
Args:
    pipeline_config: Dictionary of configuration objects. Keys are `model`,
      `train_config`, `train_input_config`, `eval_config`, `eval_input_config`,
      `lstm_model`. Values are the corresponding config objects.
trained_checkpoint_prefix: a file prefix for the checkpoint containing the
trained parameters of the SSD model.
output_dir: A directory to write the tflite graph and anchor file to.
    add_postprocessing_op: If true, adds a TFLite_Detection_PostProcess custom
      op to the frozen graph
max_detections: Maximum number of detections (boxes) to show
max_classes_per_detection: Number of classes to display per detection
detections_per_class: In regular NonMaxSuppression, number of anchors used
for NonMaxSuppression per class
use_regular_nms: Flag to set postprocessing op to use Regular NMS instead
of Fast NMS.
binary_graph_name: Name of the exported graph file in binary format.
txt_graph_name: Name of the exported graph file in text format.
Raises:
    ValueError: if the pipeline config contains models other than ssd or uses a
      fixed_shape_resizer and provides a shape as well.
"""
model_config = pipeline_config['model']
lstm_config = pipeline_config['lstm_model']
eval_config = pipeline_config['eval_config']
tf.gfile.MakeDirs(output_dir)
if model_config.WhichOneof('model') != 'ssd':
raise ValueError('Only ssd models are supported in tflite. '
'Found {} in config'.format(
model_config.WhichOneof('model')))
num_classes = model_config.ssd.num_classes
nms_score_threshold = {
model_config.ssd.post_processing.batch_non_max_suppression.
score_threshold
}
nms_iou_threshold = {
model_config.ssd.post_processing.batch_non_max_suppression.
iou_threshold
}
scale_values = {}
scale_values['y_scale'] = {
model_config.ssd.box_coder.faster_rcnn_box_coder.y_scale
}
scale_values['x_scale'] = {
model_config.ssd.box_coder.faster_rcnn_box_coder.x_scale
}
scale_values['h_scale'] = {
model_config.ssd.box_coder.faster_rcnn_box_coder.height_scale
}
scale_values['w_scale'] = {
model_config.ssd.box_coder.faster_rcnn_box_coder.width_scale
}
image_resizer_config = model_config.ssd.image_resizer
image_resizer = image_resizer_config.WhichOneof('image_resizer_oneof')
num_channels = _DEFAULT_NUM_CHANNELS
if image_resizer == 'fixed_shape_resizer':
height = image_resizer_config.fixed_shape_resizer.height
width = image_resizer_config.fixed_shape_resizer.width
if image_resizer_config.fixed_shape_resizer.convert_to_grayscale:
num_channels = 1
    # TODO(richardbrks): Figure out how to support a batch size of None.
shape = [lstm_config.eval_unroll_length, height, width, num_channels]
else:
raise ValueError(
        'Only fixed_shape_resizer '
'is supported with tflite. Found {}'.format(
image_resizer_config.WhichOneof('image_resizer_oneof')))
video_tensor = tf.placeholder(
tf.float32, shape=shape, name='input_video_tensor')
detection_model = model_builder.build(model_config, lstm_config,
is_training=False)
preprocessed_video, true_image_shapes = detection_model.preprocess(
tf.to_float(video_tensor))
predicted_tensors = detection_model.predict(preprocessed_video,
true_image_shapes)
# predicted_tensors = detection_model.postprocess(predicted_tensors,
# true_image_shapes)
# The score conversion occurs before the post-processing custom op
_, score_conversion_fn = post_processing_builder.build(
model_config.ssd.post_processing)
class_predictions = score_conversion_fn(
predicted_tensors['class_predictions_with_background'])
with tf.name_scope('raw_outputs'):
# 'raw_outputs/box_encodings': a float32 tensor of shape [1, num_anchors, 4]
# containing the encoded box predictions. Note that these are raw
# predictions and no Non-Max suppression is applied on them and
# no decode center size boxes is applied to them.
tf.identity(predicted_tensors['box_encodings'], name='box_encodings')
# 'raw_outputs/class_predictions': a float32 tensor of shape
# [1, num_anchors, num_classes] containing the class scores for each anchor
# after applying score conversion.
tf.identity(class_predictions, name='class_predictions')
# 'anchors': a float32 tensor of shape
# [4, num_anchors] containing the anchors as a constant node.
tf.identity(
get_const_center_size_encoded_anchors(predicted_tensors['anchors']),
name='anchors')
# Add global step to the graph, so we know the training step number when we
# evaluate the model.
tf.train.get_or_create_global_step()
# graph rewriter
is_quantized = ('graph_rewriter' in pipeline_config)
if is_quantized:
graph_rewriter_config = pipeline_config['graph_rewriter']
graph_rewriter_fn = graph_rewriter_builder.build(
graph_rewriter_config, is_training=False, is_export=True)
graph_rewriter_fn()
if model_config.ssd.feature_extractor.HasField('fpn'):
exporter.rewrite_nn_resize_op(is_quantized)
# freeze the graph
saver_kwargs = {}
if eval_config.use_moving_averages:
saver_kwargs['write_version'] = saver_pb2.SaverDef.V1
moving_average_checkpoint = tempfile.NamedTemporaryFile()
exporter.replace_variable_values_with_moving_averages(
tf.get_default_graph(), trained_checkpoint_prefix,
moving_average_checkpoint.name)
checkpoint_to_use = moving_average_checkpoint.name
else:
checkpoint_to_use = trained_checkpoint_prefix
saver = tf.train.Saver(**saver_kwargs)
input_saver_def = saver.as_saver_def()
frozen_graph_def = exporter.freeze_graph_with_def_protos(
input_graph_def=tf.get_default_graph().as_graph_def(),
input_saver_def=input_saver_def,
input_checkpoint=checkpoint_to_use,
output_node_names=','.join([
'raw_outputs/box_encodings', 'raw_outputs/class_predictions',
'anchors'
]),
restore_op_name='save/restore_all',
filename_tensor_name='save/Const:0',
clear_devices=True,
output_graph='',
initializer_nodes='')
# Add new operation to do post processing in a custom op (TF Lite only)
  # TODO(richardbrks): Should this use detection_model.postprocess instead?
if add_postprocessing_op:
transformed_graph_def = append_postprocessing_op(
frozen_graph_def, max_detections, max_classes_per_detection,
nms_score_threshold, nms_iou_threshold, num_classes, scale_values,
detections_per_class, use_regular_nms)
else:
# Return frozen without adding post-processing custom op
transformed_graph_def = frozen_graph_def
binary_graph = os.path.join(output_dir, binary_graph_name)
with tf.gfile.GFile(binary_graph, 'wb') as f:
f.write(transformed_graph_def.SerializeToString())
txt_graph = os.path.join(output_dir, txt_graph_name)
with tf.gfile.GFile(txt_graph, 'w') as f:
f.write(str(transformed_graph_def))