Merge branch 'master' into move_to_keraslayers_fasterrcnn_fpn_keras_feature_extractor

0cceabfc · Yiming Shi · GitHub · 17821c0d · 39ee0ac9 · 0cceabfc
Unverified Commit 0cceabfc authored Aug 03, 2020 by Yiming Shi Committed by GitHub Aug 03, 2020
20 changed files
--- a/official/recommendation/ncf_common.py
+++ b/official/recommendation/ncf_common.py
@@ -94,7 +94,7 @@ def parse_flags(flags_obj):
      "beta2": flags_obj.beta2,
      "epsilon": flags_obj.epsilon,
      "match_mlperf": flags_obj.ml_perf,
-      "epochs_between_evals": FLAGS.epochs_between_evals,
+      "epochs_between_evals": flags_obj.epochs_between_evals,
      "keras_use_ctl": flags_obj.keras_use_ctl,
      "hr_threshold": flags_obj.hr_threshold,
      "stream_files": flags_obj.tpu is not None,

--- a/official/recommendation/ncf_input_pipeline.py
+++ b/official/recommendation/ncf_input_pipeline.py
@@ -25,43 +25,39 @@ import tensorflow.compat.v2 as tf
 # pylint: enable=g-bad-import-order

 from official.recommendation import constants as rconst
-from official.recommendation import movielens
 from official.recommendation import data_pipeline
-
-NUM_SHARDS = 16
+from official.recommendation import movielens


 def create_dataset_from_tf_record_files(input_file_pattern,
                                        pre_batch_size,
                                        batch_size,
-                                        is_training=True):
+                                        is_training=True,
+                                        rebatch=False):
  """Creates dataset from (tf)records files for training/evaluation."""
+  if pre_batch_size != batch_size:
+    raise ValueError("Pre-batch ({}) size is not equal to batch "
+                     "size ({})".format(pre_batch_size, batch_size))

  files = tf.data.Dataset.list_files(input_file_pattern, shuffle=is_training)

-  def make_dataset(files_dataset, shard_index):
-    """Returns dataset for sharded tf record files."""
-    if pre_batch_size != batch_size:
-      raise ValueError("Pre-batch ({}) size is not equal to batch "
-                       "size ({})".format(pre_batch_size, batch_size))
-    files_dataset = files_dataset.shard(NUM_SHARDS, shard_index)
-    dataset = files_dataset.interleave(
-        tf.data.TFRecordDataset,
-        num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    decode_fn = functools.partial(
-        data_pipeline.DatasetManager.deserialize,
-        batch_size=pre_batch_size,
-        is_training=is_training)
-    dataset = dataset.map(
-        decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    return dataset
-
-  dataset = tf.data.Dataset.range(NUM_SHARDS)
-  map_fn = functools.partial(make_dataset, files)
-  dataset = dataset.interleave(
-      map_fn,
-      cycle_length=NUM_SHARDS,
+  dataset = files.interleave(
+      tf.data.TFRecordDataset,
+      cycle_length=16,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  decode_fn = functools.partial(
+      data_pipeline.DatasetManager.deserialize,
+      batch_size=pre_batch_size,
+      is_training=is_training)
+  dataset = dataset.map(
+      decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+  if rebatch:
+    # A workaround for TPU Pod evaluation dataset.
+    # TODO (b/162341937) remove once it's fixed.
+    dataset = dataset.unbatch()
+    dataset = dataset.batch(pre_batch_size)
+
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset

@@ -162,12 +158,18 @@ def create_ncf_input_data(params,
        params["train_dataset_path"],
        input_meta_data["train_prebatch_size"],
        params["batch_size"],
-        is_training=True)
+        is_training=True,
+        rebatch=False)
+
+    # Re-batch evaluation dataset for TPU Pods.
+    # TODO (b/162341937) remove once it's fixed.
+    eval_rebatch = (params["use_tpu"] and strategy.num_replicas_in_sync > 8)
    eval_dataset = create_dataset_from_tf_record_files(
        params["eval_dataset_path"],
        input_meta_data["eval_prebatch_size"],
        params["eval_batch_size"],
-        is_training=False)
+        is_training=False,
+        rebatch=eval_rebatch)

    num_train_steps = int(input_meta_data["num_train_steps"])
    num_eval_steps = int(input_meta_data["num_eval_steps"])

--- a/official/recommendation/ncf_keras_main.py
+++ b/official/recommendation/ncf_keras_main.py
@@ -235,6 +235,7 @@ def run_ncf(_):

  params = ncf_common.parse_flags(FLAGS)
  params["distribute_strategy"] = strategy
+  params["use_tpu"] = (FLAGS.distribution_strategy == "tpu")

  if params["use_tpu"] and not params["keras_use_ctl"]:
    logging.error("Custom training loop must be used when using TPUStrategy.")
@@ -488,19 +489,20 @@ def run_ncf_custom_training(params,
        c.on_batch_end(current_step)

    train_loss /= num_train_steps
-    logging.info("Done training epoch %s, epoch loss=%s.", epoch + 1,
+    logging.info("Done training epoch %s, epoch loss=%.3f", epoch + 1,
                 train_loss)

    eval_input_iterator = iter(
        strategy.experimental_distribute_dataset(eval_input_dataset))
-    hr_sum = 0
-    hr_count = 0
+
+    hr_sum = 0.0
+    hr_count = 0.0
    for _ in range(num_eval_steps):
      step_hr_sum, step_hr_count = eval_step(eval_input_iterator)
      hr_sum += step_hr_sum
      hr_count += step_hr_count

-    logging.info("Done eval epoch %s, hit_rate=%s.", epoch + 1,
+    logging.info("Done eval epoch %s, hit_rate=%.3f", epoch + 1,
                 hr_sum / hr_count)
    if eval_summary_writer:
      with eval_summary_writer.as_default():

--- a/official/requirements.txt
+++ b/official/requirements.txt
@@ -3,7 +3,7 @@ google-api-python-client>=1.6.7
 google-cloud-bigquery>=0.31.0
 kaggle>=1.3.9
 numpy>=1.15.4
-oauth2client>=4.1.2
+oauth2client
 pandas>=0.22.0
 psutil>=5.4.3
 py-cpuinfo>=3.3.0
@@ -15,11 +15,13 @@ tensorflow-addons
 dataclasses
 gin-config
 tf_slim>=1.1.0
-typing
-sentencepiece
 Cython
 matplotlib
-opencv-python-headless
 pyyaml
+# CV related dependencies
+opencv-python-headless
 Pillow
-e git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI
+pycocotools
+# NLP related dependencies
+seqeval
+sentencepiece
--- a/official/staging/training/controller_test.py
+++ b/official/staging/training/controller_test.py
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for official.staging.training.controller."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from absl.testing import parameterized
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.python.distribute import combinations
-from tensorflow.python.distribute import strategy_combinations
-from official.staging.training import controller
-from official.staging.training import standard_runnable
-
-
-def all_strategy_combinations():
-  """Gets combinations of distribution strategies."""
-  return combinations.combine(
-      strategy=[
-          strategy_combinations.one_device_strategy,
-          strategy_combinations.tpu_strategy,
-          strategy_combinations.one_device_strategy_gpu,
-          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
-      ],
-      mode="eager",
-  )
-
-
-def create_model():
-  x = tf.keras.layers.Input(shape=(3,), name="input")
-  y = tf.keras.layers.Dense(4, name="dense")(x)
-  model = tf.keras.Model(x, y)
-  return model
-
-
-def summaries_with_matching_keyword(keyword, summary_dir):
-  """Yields summary protos matching given keyword from event file."""
-  event_paths = tf.io.gfile.glob(os.path.join(summary_dir, "events*"))
-  for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
-    if event.summary is not None:
-      for value in event.summary.value:
-        if keyword in value.tag:
-          tf.compat.v1.logging.error(event)
-          yield event.summary
-
-
-def check_eventfile_for_keyword(keyword, summary_dir):
-  """Checks event files for the keyword."""
-  return any(summaries_with_matching_keyword(keyword, summary_dir))
-
-
-def dataset_fn(ctx):
-  del ctx
-  inputs = np.zeros((10, 3), dtype=np.float32)
-  targets = np.zeros((10, 4), dtype=np.float32)
-  dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
-  dataset = dataset.repeat(100)
-  dataset = dataset.batch(10, drop_remainder=True)
-  return dataset
-
-
-class TestRunnable(standard_runnable.StandardTrainable,
-                   standard_runnable.StandardEvaluable):
-  """Implements the training and evaluation APIs for the test model."""
-
-  def __init__(self):
-    standard_runnable.StandardTrainable.__init__(self)
-    standard_runnable.StandardEvaluable.__init__(self)
-    self.strategy = tf.distribute.get_strategy()
-    self.model = create_model()
-    self.optimizer = tf.keras.optimizers.RMSprop()
-    self.global_step = self.optimizer.iterations
-    self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
-    self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
-
-  def build_train_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
-
-  def train_step(self, iterator):
-
-    def _replicated_step(inputs):
-      """Replicated training step."""
-      inputs, targets = inputs
-      with tf.GradientTape() as tape:
-        outputs = self.model(inputs)
-        loss = tf.math.reduce_sum(outputs - targets)
-      grads = tape.gradient(loss, self.model.variables)
-      self.optimizer.apply_gradients(zip(grads, self.model.variables))
-      self.train_loss.update_state(loss)
-
-    self.strategy.run(_replicated_step, args=(next(iterator),))
-
-  def train_loop_end(self):
-    return {
-        "loss": self.train_loss.result(),
-    }
-
-  def build_eval_dataset(self):
-    return self.strategy.experimental_distribute_datasets_from_function(
-        dataset_fn)
-
-  def eval_begin(self):
-    self.eval_loss.reset_states()
-
-  def eval_step(self, iterator):
-
-    def _replicated_step(inputs):
-      """Replicated evaluation step."""
-      inputs, targets = inputs
-      outputs = self.model(inputs)
-      loss = tf.math.reduce_sum(outputs - targets)
-      self.eval_loss.update_state(loss)
-
-    self.strategy.run(_replicated_step, args=(next(iterator),))
-
-  def eval_end(self):
-    return {
-        "eval_loss": self.eval_loss.result(),
-    }
-
-
-class ControllerTest(tf.test.TestCase, parameterized.TestCase):
-
-  def setUp(self):
-    super(ControllerTest, self).setUp()
-    self.model_dir = self.get_temp_dir()
-
-  def test_no_checkpoint(self):
-    test_runnable = TestRunnable()
-    # No checkpoint manager and no strategy.
-    test_controller = controller.Controller(
-        train_fn=test_runnable.train,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        summary_interval=2,
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.train(evaluate=True)
-    self.assertEqual(test_runnable.global_step.numpy(), 10)
-    # Loss and accuracy values should be written into summaries.
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "loss", os.path.join(self.model_dir, "summaries/train")))
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
-    # No checkpoint, so global step starts from 0.
-    test_runnable.global_step.assign(0)
-    test_controller.train(evaluate=True)
-    self.assertEqual(test_runnable.global_step.numpy(), 10)
-
-  def test_no_checkpoint_and_summaries(self):
-    test_runnable = TestRunnable()
-    # No checkpoint + summary directories.
-    test_controller = controller.Controller(
-        train_fn=test_runnable.train,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.train(evaluate=True)
-    self.assertEqual(test_runnable.global_step.numpy(), 10)
-
-  @combinations.generate(all_strategy_combinations())
-  def test_train_and_evaluate(self, strategy):
-    with strategy.scope():
-      test_runnable = TestRunnable()
-
-    checkpoint = tf.train.Checkpoint(
-        model=test_runnable.model, optimizer=test_runnable.optimizer)
-    checkpoint_manager = tf.train.CheckpointManager(
-        checkpoint,
-        self.model_dir,
-        max_to_keep=None,
-        step_counter=test_runnable.global_step,
-        checkpoint_interval=10)
-    test_controller = controller.Controller(
-        strategy=strategy,
-        train_fn=test_runnable.train,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        summary_interval=2,
-        checkpoint_manager=checkpoint_manager,
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.train(evaluate=True)
-
-    # Checkpoints are saved.
-    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
-
-    # Loss and accuracy values should be written into summaries.
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "loss", os.path.join(self.model_dir, "summaries/train")))
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
-
-  @combinations.generate(all_strategy_combinations())
-  def test_train_only(self, strategy):
-    with strategy.scope():
-      test_runnable = TestRunnable()
-
-    checkpoint = tf.train.Checkpoint(
-        model=test_runnable.model, optimizer=test_runnable.optimizer)
-    checkpoint_manager = tf.train.CheckpointManager(
-        checkpoint,
-        self.model_dir,
-        max_to_keep=None,
-        step_counter=test_runnable.global_step,
-        checkpoint_interval=10)
-    test_controller = controller.Controller(
-        strategy=strategy,
-        train_fn=test_runnable.train,
-        global_step=test_runnable.global_step,
-        train_steps=10,
-        steps_per_loop=2,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        summary_interval=2,
-        checkpoint_manager=checkpoint_manager,
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-    )
-    test_controller.train(evaluate=False)
-
-    # Checkpoints are saved.
-    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
-
-    # Only train summaries are written.
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "loss", os.path.join(self.model_dir, "summaries/train")))
-    self.assertFalse(
-        tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/eval")))
-
-  @combinations.generate(all_strategy_combinations())
-  def test_evaluate_only(self, strategy):
-    with strategy.scope():
-      test_runnable = TestRunnable()
-
-    checkpoint = tf.train.Checkpoint(model=test_runnable.model)
-    checkpoint.save(os.path.join(self.model_dir, "ckpt"))
-
-    checkpoint_manager = tf.train.CheckpointManager(
-        checkpoint,
-        self.model_dir,
-        max_to_keep=None,
-        step_counter=test_runnable.global_step)
-    test_controller = controller.Controller(
-        strategy=strategy,
-        eval_fn=test_runnable.evaluate,
-        global_step=test_runnable.global_step,
-        checkpoint_manager=checkpoint_manager,
-        summary_dir=os.path.join(self.model_dir, "summaries/train"),
-        eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
-        eval_steps=2,
-        eval_interval=5)
-    test_controller.evaluate()
-
-    # Only eval summaries are written
-    self.assertFalse(
-        tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/train")))
-    self.assertNotEmpty(
-        tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
-    self.assertTrue(
-        check_eventfile_for_keyword(
-            "eval_loss", os.path.join(self.model_dir, "summaries/eval")))
-
-
-if __name__ == "__main__":
-  tf.test.main()
--- a/official/vision/detection/README.md
+++ b/official/vision/detection/README.md
@@ -48,6 +48,22 @@ so the checkpoints are not compatible.
 We will unify the implementation soon.


+### Train a SpineNet-49 based RetinaNet.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/vision/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+

 ### Train a custom RetinaNet using the config file.

@@ -123,8 +139,6 @@ predict:
 predict_batch_size: 8
 architecture:
 use_bfloat16: False
-retinanet_parser:
- use_bfloat16: False
 train:
 total_steps: 1
 batch_size: 8
@@ -165,6 +179,24 @@ so the checkpoints are not compatible.
 We will unify the implementation soon.


+### Train a SpineNet-49 based Mask R-CNN.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+python3 ~/models/official/vision/detection/main.py \
+  --strategy_type=tpu \
+  --tpu="${TPU_NAME?}" \
+  --model_dir="${MODEL_DIR?}" \
+  --mode=train \
+  --model=mask_rcnn \
+  --params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
+```
+
+
 ### Train a custom Mask R-CNN using the config file.

 First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
@@ -245,8 +277,6 @@ predict:
 predict_batch_size: 8
 architecture:
 use_bfloat16: False
-maskrcnn_parser:
- use_bfloat16: False
 train:
 total_steps: 1000
 batch_size: 8
@@ -255,6 +285,140 @@ use_tpu: False
 "
 ```

+## Train ShapeMask on TPU
+
+### Train a ResNet-50 based ShapeMask.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
+TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
+EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
+VAL_JSON_FILE="<path to the validation annotation JSON file>"
+SHAPE_PRIOR_PATH="<path to shape priors>"
+python3 ~/models/official/vision/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} }, shapemask_head: {use_category_for_mask: true, shape_prior_path: ${SHAPE_PRIOR_PATH}} }"
+```
+
+The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
+
+The shape priors can be downloaded [here](https://storage.googleapis.com/cloud-tpu-checkpoints/shapemask/kmeans_class_priors_91x20x32x32.npy).
+
+
+### Train a custom ShapeMask using the config file.
+
+First, create a YAML config file, e.g. *my_shapemask.yaml*.
+This file specifies the parameters to be overridden:
+
+```YAML
+# my_shapemask.yaml
+train:
+  train_file_pattern: <path to the TFRecord training data>
+  total_steps: <total steps to train>
+  batch_size: <training batch size>
+eval:
+  eval_file_pattern: <path to the TFRecord validation data>
+  val_json_file: <path to the validation annotation JSON file>
+  batch_size: <evaluation batch size>
+shapemask_head:
+  shape_prior_path: <path to shape priors>
+```
+
+Once the YAML config file is created, you can launch the training using the
+following command.
+
+```bash
+TPU_NAME="<your GCP TPU name>"
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/vision/detection/main.py \
+  --strategy_type=tpu \
+  --tpu=${TPU_NAME} \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+## Train ShapeMask on GPU
+
+Training on GPU is similar to that on TPU. The major change is the strategy type
+(use
+"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
+for multiple GPU and
+"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
+for single GPU).
+
+Multi-GPU example (assuming there are 8 GPUs connected to the host):
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/vision/detection/main.py \
+  --strategy_type=mirrored \
+  --num_gpus=8 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+A single-GPU example:
+
+```bash
+MODEL_DIR="<path to the directory to store model files>"
+python3 ~/models/official/vision/detection/main.py \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --model_dir=${MODEL_DIR} \
+  --mode=train \
+  --model=shapemask \
+  --config_file="my_shapemask.yaml"
+```
+
+
+An example with inline configuration (YAML or JSON format):
+
+```
+python3 ~/models/official/vision/detection/main.py \
+  --model_dir=<model folder> \
+  --strategy_type=one_device \
+  --num_gpus=1 \
+  --mode=train \
+  --model=shapemask \
+  --params_override="eval:
+ eval_file_pattern: <Eval TFRecord file pattern>
+ batch_size: 8
+ val_json_file: <COCO format groundtruth JSON file>
+train:
+ total_steps: 1000
+ batch_size: 8
+ train_file_pattern: <Train TFRecord file pattern>
+use_tpu: False
+"
+```
+
+
+### Run the evaluation (after training)
+
+```
+python3 /usr/share/models/official/vision/detection/main.py \
+   --strategy_type=tpu \
+   --tpu=${TPU_NAME} \
+   --model_dir=${MODEL_DIR} \
+   --mode=eval \
+   --model=shapemask \
+   --params_override="{eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN}, eval_samples: 5000 } }"
+```
+
+`MODEL_DIR` needs to point to the directory of the trained ShapeMask model.
+To run on a GPU instead, set `--strategy_type=mirrored` and `--num_gpus=1`.
+
 Note: The JSON groundtruth file is useful for [COCO dataset](http://cocodataset.org/#home) and can be
 downloaded from the [COCO website](http://cocodataset.org/#download). For a custom dataset, it is unnecessary because the groundtruth can be included in the TFRecord files.


--- a/official/vision/detection/configs/base_config.py
+++ b/official/vision/detection/configs/base_config.py
@@ -17,10 +17,12 @@

 BACKBONES = [
    'resnet',
+    'spinenet',
 ]

 MULTILEVEL_FEATURES = [
    'fpn',
+    'identity',
 ]

 # pylint: disable=line-too-long
@@ -118,6 +120,9 @@ BASE_CFG = {
    'resnet': {
        'resnet_depth': 50,
    },
+    'spinenet': {
+        'model_id': '49',
+    },
    'fpn': {
        'fpn_feat_dims': 256,
        'use_separable_conv': False,

--- a/official/vision/detection/dataloader/anchor.py
+++ b/official/vision/detection/dataloader/anchor.py
@@ -46,15 +46,15 @@ class Anchor(object):
      num_scales: integer number representing intermediate scales added
        on each level. For instances, num_scales=2 adds one additional
        intermediate anchor scales [2^0, 2^0.5] on each level.
-      aspect_ratios: list of float numbers representing the aspect raito anchors
+      aspect_ratios: list of float numbers representing the aspect ratio anchors
        added on each level. The number indicates the ratio of width to height.
        For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
        scale level.
      anchor_size: float number representing the scale of size of the base
        anchor to the feature stride 2^level.
      image_size: a list of integer numbers or Tensors representing
-        [height, width] of the input image size.The image_size should be divided
-        by the largest feature stride 2^max_level.
+        [height, width] of the input image size. The image_size should be
+        divisible by the largest feature stride 2^max_level.
    """
    self.min_level = min_level
    self.max_level = max_level
@@ -77,8 +77,8 @@ class Anchor(object):
      for scale in range(self.num_scales):
        for aspect_ratio in self.aspect_ratios:
          stride = 2 ** level
-          intermidate_scale = 2 ** (scale / float(self.num_scales))
-          base_anchor_size = self.anchor_size * stride * intermidate_scale
+          intermediate_scale = 2 ** (scale / float(self.num_scales))
+          base_anchor_size = self.anchor_size * stride * intermediate_scale
          aspect_x = aspect_ratio ** 0.5
          aspect_y = aspect_ratio ** -0.5
          half_anchor_size_x = base_anchor_size * aspect_x / 2.0

--- a/official/vision/detection/dataloader/maskrcnn_parser.py
+++ b/official/vision/detection/dataloader/maskrcnn_parser.py
@@ -185,12 +185,12 @@ class Parser(object):
    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
-      num_groundtrtuhs = tf.shape(classes)[0]
-      with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
+      num_groundtruths = tf.shape(classes)[0]
+      with tf.control_dependencies([num_groundtruths, is_crowds]):
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
-            lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
+            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:

--- a/official/vision/detection/evaluation/coco_utils.py
+++ b/official/vision/detection/evaluation/coco_utils.py
@@ -237,7 +237,7 @@ def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None):
              (boxes[j, k, 3] - boxes[j, k, 1]) *
              (boxes[j, k, 2] - boxes[j, k, 0]))
        if 'masks' in groundtruths:
-          mask = Image.open(six.StringIO(groundtruths['masks'][i][j, k]))
+          mask = Image.open(six.BytesIO(groundtruths['masks'][i][j, k]))
          width, height = mask.size
          np_mask = (
              np.array(mask.getdata()).reshape(height, width).astype(np.uint8))

--- a/official/vision/detection/main.py
+++ b/official/vision/detection/main.py
@@ -19,25 +19,28 @@ from __future__ import division
 # from __future__ import google_type_annotations
 from __future__ import print_function

-from absl import app
-from absl import flags
-from absl import logging
 import functools
-import os
 import pprint
+
+# pylint: disable=g-bad-import-order
 import tensorflow as tf

+from absl import app
+from absl import flags
+from absl import logging
+# pylint: enable=g-bad-import-order
+
 from official.modeling.hyperparams import params_dict
 from official.modeling.training import distributed_executor as executor
 from official.utils import hyperparams_flags
+from official.utils.flags import core as flags_core
+from official.utils.misc import distribution_utils
+from official.utils.misc import keras_utils
 from official.vision.detection.configs import factory as config_factory
 from official.vision.detection.dataloader import input_reader
 from official.vision.detection.dataloader import mode_keys as ModeKeys
 from official.vision.detection.executor.detection_executor import DetectionDistributedExecutor
 from official.vision.detection.modeling import factory as model_factory
-from official.utils.flags import core as flags_core
-from official.utils.misc import distribution_utils
-from official.utils.misc import keras_utils

 hyperparams_flags.initialize_common_flags()
 flags_core.define_log_steps()
@@ -194,6 +197,20 @@ def run(callbacks=None):
          'strategy_config': executor.strategy_flags_dict(),
      },
      is_strict=False)
+
+  # Make sure use_tpu and strategy_type are in sync.
+  params.use_tpu = (params.strategy_type == 'tpu')
+
+  if not params.use_tpu:
+    params.override({
+        'architecture': {
+            'use_bfloat16': False,
+        },
+        'norm_activation': {
+            'use_sync_bn': False,
+        },
+    }, is_strict=True)
+
  params.validate()
  params.lock()
  pp = pprint.PrettyPrinter()

--- a/official/vision/detection/modeling/architecture/factory.py
+++ b/official/vision/detection/modeling/architecture/factory.py
@@ -23,6 +23,7 @@ from official.vision.detection.modeling.architecture import heads
 from official.vision.detection.modeling.architecture import identity
 from official.vision.detection.modeling.architecture import nn_ops
 from official.vision.detection.modeling.architecture import resnet
+from official.vision.detection.modeling.architecture import spinenet


 def norm_activation_generator(params):
@@ -42,6 +43,9 @@ def backbone_generator(params):
        activation=params.norm_activation.activation,
        norm_activation=norm_activation_generator(
            params.norm_activation))
+  elif params.architecture.backbone == 'spinenet':
+    spinenet_params = params.spinenet
+    backbone_fn = spinenet.SpineNetBuilder(model_id=spinenet_params.model_id)
  else:
    raise ValueError('Backbone model `{}` is not supported.'
                     .format(params.architecture.backbone))

--- a/official/vision/detection/modeling/architecture/fpn.py
+++ b/official/vision/detection/modeling/architecture/fpn.py
@@ -28,7 +28,7 @@ import functools

 import tensorflow as tf

-from tensorflow.python.keras import backend
+from official.vision.detection.modeling.architecture import keras_utils
 from official.vision.detection.modeling.architecture import nn_ops
 from official.vision.detection.ops import spatial_transform_ops

@@ -120,7 +120,7 @@ class Fpn(object):
          'The minimum backbone level %d should be '%(min(input_levels)) +
          'less or equal to FPN minimum level %d.:'%(self._min_level))
    backbone_max_level = min(max(input_levels), self._max_level)
-    with backend.get_graph().as_default(), tf.name_scope('fpn'):
+    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('fpn'):
      # Adds lateral connections.
      feats_lateral = {}
      for level in range(self._min_level, backbone_max_level + 1):

--- a/official/vision/detection/modeling/architecture/heads.py
+++ b/official/vision/detection/modeling/architecture/heads.py
@@ -22,7 +22,8 @@ import functools

 import numpy as np
 import tensorflow as tf
-from tensorflow.python.keras import backend
+
+from official.vision.detection.modeling.architecture import keras_utils
 from official.vision.detection.modeling.architecture import nn_ops
 from official.vision.detection.ops import spatial_transform_ops

@@ -127,7 +128,7 @@ class RpnHead(tf.keras.layers.Layer):
    scores_outputs = {}
    box_outputs = {}

-    with backend.get_graph().as_default(), tf.name_scope('rpn_head'):
+    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('rpn_head'):
      for level in range(self._min_level, self._max_level + 1):
        scores_output, box_output = self._shared_rpn_heads(
            features[level], self._anchors_per_location, level, is_training)
@@ -249,7 +250,8 @@ class FastrcnnHead(tf.keras.layers.Layer):
        predictions.
    """

-    with backend.get_graph().as_default(), tf.name_scope('fast_rcnn_head'):
+    with keras_utils.maybe_enter_backend_graph(), tf.name_scope(
+        'fast_rcnn_head'):
      # reshape inputs beofre FC.
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()

@@ -368,7 +370,7 @@ class MaskrcnnHead(tf.keras.layers.Layer):
        boxes is not 4.
    """

-    with backend.get_graph().as_default():
+    with keras_utils.maybe_enter_backend_graph():
      with tf.name_scope('mask_head'):
        _, num_rois, height, width, filters = roi_features.get_shape().as_list()
        net = tf.reshape(roi_features, [-1, height, width, filters])
@@ -552,7 +554,8 @@ class RetinanetHead(object):
    """Returns outputs of RetinaNet head."""
    class_outputs = {}
    box_outputs = {}
-    with backend.get_graph().as_default(), tf.name_scope('retinanet_head'):
+    with keras_utils.maybe_enter_backend_graph(), tf.name_scope(
+        'retinanet_head'):
      for level in range(self._min_level, self._max_level + 1):
        features = fpn_features[level]

@@ -644,7 +647,7 @@ class ShapemaskPriorHead(object):
      detection_priors: A float Tensor of shape [batch_size * num_instances,
        mask_size, mask_size, 1].
    """
-    with backend.get_graph().as_default(), tf.name_scope('prior_mask'):
+    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('prior_mask'):
      batch_size, num_instances, _ = boxes.get_shape().as_list()
      outer_boxes = tf.cast(outer_boxes, tf.float32)
      boxes = tf.cast(boxes, tf.float32)
@@ -807,7 +810,7 @@ class ShapemaskCoarsemaskHead(object):
      mask_outputs: instance mask prediction as a float Tensor of shape
        [batch_size, num_instances, mask_size, mask_size].
    """
-    with backend.get_graph().as_default(), tf.name_scope('coarse_mask'):
+    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('coarse_mask'):
      # Transform detection priors to have the same dimension as features.
      detection_priors = tf.expand_dims(detection_priors, axis=-1)
      detection_priors = self._coarse_mask_fc(detection_priors)
@@ -939,7 +942,7 @@ class ShapemaskFinemaskHead(object):
    """
    # Extract the foreground mean features
    # with tf.variable_scope('fine_mask', reuse=tf.AUTO_REUSE):
-    with backend.get_graph().as_default(), tf.name_scope('fine_mask'):
+    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('fine_mask'):
      mask_probs = tf.nn.sigmoid(mask_logits)
      # Compute instance embedding for hard average.
      binary_mask = tf.cast(tf.greater(mask_probs, 0.5), features.dtype)

--- a/research/deep_contextual_bandits/bandits/core/bayesian_nn.py
+++ b/research/deep_contextual_bandits/bandits/core/bayesian_nn.py
-# Copyright 2018 The TensorFlow Authors All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,25 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-"""Define the abstract class for Bayesian Neural Networks."""
+"""Util functions to integrate with Keras internals."""

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function


-class BayesianNN(object):
-  """A Bayesian neural network keeps a distribution over neural nets."""
+from tensorflow.python.keras import backend

-  def __init__(self, optimizer):
-    pass
+try:
+  from tensorflow.python.keras.engine import keras_tensor  # pylint: disable=g-import-not-at-top,unused-import
+  keras_tensor.disable_keras_tensors()
+except ImportError:
+  keras_tensor = None

-  def build_model(self):
-    pass

-  def train(self, data):
+class NoOpContextManager(object):
+
+  def __enter__(self):
    pass

-  def sample(self, steps):
+  def __exit__(self, *args):
    pass
+
+
+def maybe_enter_backend_graph():
+  """Returns the context manager under which layers should build their ops.
+
+  Returns:
+    A `NoOpContextManager` when the `keras_tensor` module could be imported and
+    reports that KerasTensors are enabled; otherwise the `as_default()` context
+    manager of the Keras backend graph.
+  """
+  keras_tensors_on = (
+      keras_tensor is not None and keras_tensor.keras_tensors_enabled())
+  if not keras_tensors_on:
+    return backend.get_graph().as_default()
+  return NoOpContextManager()
--- a/official/vision/detection/modeling/architecture/nn_blocks.py
+++ b/official/vision/detection/modeling/architecture/nn_blocks.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains common building blocks for neural networks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from official.modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class ResidualBlock(tf.keras.layers.Layer):
+  """A residual block."""
+
+  def __init__(self,
+               filters,
+               strides,
+               use_projection=False,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """A residual block with BN after convolutions.
+
+    Args:
+      filters: `int` number of filters for both 3x3 convolutions in this block
+        (and for the 1x1 projection shortcut when `use_projection` is True).
+      strides: `int` block stride. If greater than 1, this block will ultimately
+        downsample the input.
+      use_projection: `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      activation: `str` name of the activation function.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(ResidualBlock, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._strides = strides
+    self._use_projection = use_projection
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    if self._use_projection:
+      self._shortcut = tf.keras.layers.Conv2D(
+          filters=self._filters,
+          kernel_size=1,
+          strides=self._strides,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=1,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    super(ResidualBlock, self).build(input_shape)
+
+  def get_config(self):
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'use_projection': self._use_projection,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+
+    base_config = super(ResidualBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs):
+    shortcut = inputs
+    if self._use_projection:
+      shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+
+    return self._activation_fn(x + shortcut)
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class BottleneckBlock(tf.keras.layers.Layer):
+  """A standard bottleneck block."""
+
+  def __init__(self,
+               filters,
+               strides,
+               use_projection=False,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """A standard bottleneck block with BN after convolutions.
+
+    Args:
+      filters: `int` number of filters for the first two convolutions. Note that
+        the third and final convolution will use 4 times as many filters.
+      strides: `int` block stride. If greater than 1, this block will ultimately
+        downsample the input.
+      use_projection: `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+      kernel_initializer: kernel_initializer for convolutional layers.
+      kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
+        Default to None.
+      bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
+        Default to None.
+      activation: `str` name of the activation function.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` normalization momentum for the moving average.
+      norm_epsilon: `float` small float added to variance to avoid dividing by
+        zero.
+      **kwargs: keyword arguments to be passed.
+    """
+    super(BottleneckBlock, self).__init__(**kwargs)
+
+    self._filters = filters
+    self._strides = strides
+    self._use_projection = use_projection
+    self._use_sync_bn = use_sync_bn
+    self._activation = activation
+    self._kernel_initializer = kernel_initializer
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    if use_sync_bn:
+      self._norm = tf.keras.layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = tf.keras.layers.BatchNormalization
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+    self._activation_fn = tf_utils.get_activation(activation)
+
+  def build(self, input_shape):
+    if self._use_projection:
+      self._shortcut = tf.keras.layers.Conv2D(
+          filters=self._filters * 4,
+          kernel_size=1,
+          strides=self._strides,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      self._norm0 = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+
+    self._conv1 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm1 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._conv2 = tf.keras.layers.Conv2D(
+        filters=self._filters,
+        kernel_size=3,
+        strides=self._strides,
+        padding='same',
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm2 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    self._conv3 = tf.keras.layers.Conv2D(
+        filters=self._filters * 4,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    self._norm3 = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+
+    super(BottleneckBlock, self).build(input_shape)
+
+  def get_config(self):
+    config = {
+        'filters': self._filters,
+        'strides': self._strides,
+        'use_projection': self._use_projection,
+        'kernel_initializer': self._kernel_initializer,
+        'kernel_regularizer': self._kernel_regularizer,
+        'bias_regularizer': self._bias_regularizer,
+        'activation': self._activation,
+        'use_sync_bn': self._use_sync_bn,
+        'norm_momentum': self._norm_momentum,
+        'norm_epsilon': self._norm_epsilon
+    }
+
+    base_config = super(BottleneckBlock, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs):
+    shortcut = inputs
+    if self._use_projection:
+      shortcut = self._shortcut(shortcut)
+      shortcut = self._norm0(shortcut)
+
+    x = self._conv1(inputs)
+    x = self._norm1(x)
+    x = self._activation_fn(x)
+
+    x = self._conv2(x)
+    x = self._norm2(x)
+    x = self._activation_fn(x)
+
+    x = self._conv3(x)
+    x = self._norm3(x)
+
+    return self._activation_fn(x + shortcut)
--- a/official/vision/detection/modeling/architecture/resnet.py
+++ b/official/vision/detection/modeling/architecture/resnet.py
@@ -25,7 +25,7 @@ from __future__ import print_function

 from absl import logging
 import tensorflow as tf
-from tensorflow.python.keras import backend
+from official.vision.detection.modeling.architecture import keras_utils
 from official.vision.detection.modeling.architecture import nn_ops

 # TODO(b/140112644): Refactor the code with Keras style, i.e. build and call.
@@ -90,7 +90,7 @@ class Resnet(object):
      The values are corresponding feature hierarchy in ResNet with shape
      [batch_size, height_l, width_l, num_filters].
    """
-    with backend.get_graph().as_default():
+    with keras_utils.maybe_enter_backend_graph():
      with tf.name_scope('resnet%s' % self._resnet_depth):
        return self._resnet_fn(inputs, is_training)


--- a/official/vision/detection/modeling/architecture/spinenet.py
+++ b/official/vision/detection/modeling/architecture/spinenet.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of SpineNet model.
+
+X. Du, T-Y. Lin, P. Jin, G. Ghiasi, M. Tan, Y. Cui, Q. V. Le, X. Song
+SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization
+https://arxiv.org/abs/1912.05027
+"""
+import math
+
+from absl import logging
+import tensorflow as tf
+
+from official.modeling import tf_utils
+from official.vision.detection.modeling.architecture import keras_utils
+from official.vision.detection.modeling.architecture import nn_blocks
+
+# Short alias for the Keras layers namespace used throughout this module.
+layers = tf.keras.layers
+
+# Base number of filters for a block, keyed by the block's feature level. The
+# value is multiplied by the model's `filter_size_scale` before it is used.
+FILTER_SIZE_MAP = {
+    1: 32,
+    2: 64,
+    3: 128,
+    4: 256,
+    5: 256,
+    6: 256,
+    7: 256,
+}
+
+# The fixed SpineNet architecture discovered by NAS.
+# Each element represents a specification of a building block:
+#   (block_level, block_fn, (input_offset0, input_offset1), is_output).
+# The input offsets index into the list of blocks built so far, which starts
+# with the stem blocks. Blocks flagged `is_output` become the endpoint for
+# their level.
+SPINENET_BLOCK_SPECS = [
+    (2, 'bottleneck', (0, 1), False),
+    (4, 'residual', (0, 1), False),
+    (3, 'bottleneck', (2, 3), False),
+    (4, 'bottleneck', (2, 4), False),
+    (6, 'residual', (3, 5), False),
+    (4, 'bottleneck', (3, 5), False),
+    (5, 'residual', (6, 7), False),
+    (7, 'residual', (6, 8), False),
+    (5, 'bottleneck', (8, 9), False),
+    (5, 'bottleneck', (8, 10), False),
+    (4, 'bottleneck', (5, 10), True),
+    (3, 'bottleneck', (4, 10), True),
+    (5, 'bottleneck', (7, 12), True),
+    (7, 'bottleneck', (5, 14), True),
+    (6, 'bottleneck', (12, 14), True),
+]
+
+# Scaling settings for each model variant, keyed by model id. The inner keys
+# have the same names as the corresponding `SpineNet` constructor arguments.
+# NOTE(review): presumably consumed by the SpineNet builder selected through
+# `model_id` -- confirm against the builder.
+SCALING_MAP = {
+    '49S': {
+        'endpoints_num_filters': 128,
+        'filter_size_scale': 0.65,
+        'resample_alpha': 0.5,
+        'block_repeats': 1,
+    },
+    '49': {
+        'endpoints_num_filters': 256,
+        'filter_size_scale': 1.0,
+        'resample_alpha': 0.5,
+        'block_repeats': 1,
+    },
+    '96': {
+        'endpoints_num_filters': 256,
+        'filter_size_scale': 1.0,
+        'resample_alpha': 0.5,
+        'block_repeats': 2,
+    },
+    '143': {
+        'endpoints_num_filters': 256,
+        'filter_size_scale': 1.0,
+        'resample_alpha': 1.0,
+        'block_repeats': 3,
+    },
+    '190': {
+        'endpoints_num_filters': 512,
+        'filter_size_scale': 1.3,
+        'resample_alpha': 1.0,
+        'block_repeats': 4,
+    },
+}
+
+
+class BlockSpec(object):
+  """A container class that specifies the block configuration for SpineNet.
+
+  Attributes:
+    level: `int` feature level of the block. It selects the base filter count
+      from `FILTER_SIZE_MAP` and sets the block's expected feature-map size to
+      the input size divided by `2**level`.
+    block_fn: `str` name of the block type, 'bottleneck' or 'residual'.
+    input_offsets: pair of indices into the list of blocks built so far,
+      naming the two parent blocks whose outputs are fused as this block's
+      input.
+    is_output: `bool`, whether this block's output is saved as the endpoint for
+      its level.
+  """
+
+  def __init__(self, level, block_fn, input_offsets, is_output):
+    self.level = level
+    self.block_fn = block_fn
+    self.input_offsets = input_offsets
+    self.is_output = is_output
+
+
+def build_block_specs(block_specs=None):
+  """Converts raw block tuples into a list of `BlockSpec` objects.
+
+  Args:
+    block_specs: optional list of
+      `(level, block_fn, input_offsets, is_output)` tuples. When it is empty or
+      `None`, `SPINENET_BLOCK_SPECS` is used.
+
+  Returns:
+    A list of `BlockSpec` objects, one per tuple, in the same order.
+  """
+  raw_specs = block_specs or SPINENET_BLOCK_SPECS
+  logging.info('Building SpineNet block specs: %s', raw_specs)
+  return [BlockSpec(*raw_spec) for raw_spec in raw_specs]
+
+
+@tf.keras.utils.register_keras_serializable(package='Vision')
+class SpineNet(tf.keras.Model):
+  """Class to build SpineNet models."""
+
+  def __init__(self,
+               input_specs=tf.keras.layers.InputSpec(shape=[None, 640, 640, 3]),
+               min_level=3,
+               max_level=7,
+               block_specs=None,
+               endpoints_num_filters=256,
+               resample_alpha=0.5,
+               block_repeats=1,
+               filter_size_scale=1.0,
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001,
+               **kwargs):
+    """Builds the SpineNet backbone as a Keras functional model.
+
+    Args:
+      input_specs: `tf.keras.layers.InputSpec` whose shape (without the batch
+        dimension) defines the model input.
+      min_level: `int` lowest level for which an output endpoint is built.
+      max_level: `int` highest level for which an output endpoint is built.
+      block_specs: list of `BlockSpec` objects describing the scale-permuted
+        blocks. When `None`, the specs returned by `build_block_specs()` are
+        used.
+      endpoints_num_filters: `int` number of filters of the 1x1 convolution
+        applied to every output endpoint.
+      resample_alpha: `float` factor applied to the number of filters inside the
+        resampling connections between blocks.
+      block_repeats: `int` number of blocks stacked in every block group.
+      filter_size_scale: `float` multiplier applied to the base filter counts in
+        `FILTER_SIZE_MAP`.
+      kernel_initializer: kernel initializer for the convolutional layers.
+      kernel_regularizer: kernel regularizer for the convolutional layers.
+      bias_regularizer: bias regularizer for the convolutional layers.
+      activation: `str` name of the activation, either 'relu' or 'swish'.
+      use_sync_bn: if True, use synchronized batch normalization.
+      norm_momentum: `float` momentum of the normalization layers.
+      norm_epsilon: `float` epsilon of the normalization layers.
+      **kwargs: accepted for compatibility; not used by this constructor.
+
+    Raises:
+      ValueError: if `activation` is neither 'relu' nor 'swish'.
+    """
+    # Resolve the default here rather than in the signature, so that importing
+    # this module does not build (and log) the block specs as a side effect and
+    # instances do not share one default list.
+    if block_specs is None:
+      block_specs = build_block_specs()
+
+    self._min_level = min_level
+    self._max_level = max_level
+    self._block_specs = block_specs
+    self._endpoints_num_filters = endpoints_num_filters
+    self._resample_alpha = resample_alpha
+    self._block_repeats = block_repeats
+    self._filter_size_scale = filter_size_scale
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+    if activation == 'relu':
+      self._activation = tf.nn.relu
+    elif activation == 'swish':
+      self._activation = tf.nn.swish
+    else:
+      raise ValueError('Activation {} not implemented.'.format(activation))
+    # The stem is always made of two bottleneck block groups.
+    self._init_block_fn = 'bottleneck'
+    self._num_init_blocks = 2
+
+    if use_sync_bn:
+      self._norm = layers.experimental.SyncBatchNormalization
+    else:
+      self._norm = layers.BatchNormalization
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+
+    # Build SpineNet.
+    inputs = tf.keras.Input(shape=input_specs.shape[1:])
+
+    net = self._build_stem(inputs=inputs)
+    # Dimension 1 of the input shape is used as the spatial size from which
+    # every block's feature-map size is derived.
+    # NOTE(review): presumably assumes square inputs -- confirm.
+    net = self._build_scale_permuted_network(
+        net=net, input_width=input_specs.shape[1])
+    net = self._build_endpoints(net=net)
+
+    super(SpineNet, self).__init__(inputs=inputs, outputs=net)
+
+  def _block_group(self,
+                   inputs,
+                   filters,
+                   strides,
+                   block_fn_cand,
+                   block_repeats=1,
+                   name='block_group'):
+    """Stacks `block_repeats` blocks of one type and names the result.
+
+    Args:
+      inputs: rank-4 input tensor. The size of its last dimension is compared
+        with the block's output filter count to decide whether the first block
+        needs a projection shortcut.
+      filters: `int` base number of filters of each block.
+      strides: `int` stride of the first block; repeated blocks use stride 1.
+      block_fn_cand: `str` block type, 'bottleneck' or 'residual'.
+      block_repeats: `int` total number of blocks in the group.
+      name: `str` name given to the identity op that wraps the group output.
+
+    Returns:
+      The output tensor of the last block in the group.
+    """
+    block_cls = {
+        'bottleneck': nn_blocks.BottleneckBlock,
+        'residual': nn_blocks.ResidualBlock,
+    }[block_fn_cand]
+    _, _, _, in_filters = inputs.get_shape().as_list()
+
+    # Bottleneck blocks output 4x `filters` channels, residual blocks output
+    # `filters`. The identity shortcut is only usable when the first block
+    # changes neither the channel count nor the resolution.
+    out_filters = filters * 4 if block_fn_cand == 'bottleneck' else filters
+    needs_projection = in_filters != out_filters or strides != 1
+
+    # Arguments shared by every block in the group.
+    common_kwargs = dict(
+        filters=filters,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer,
+        activation=self._activation,
+        use_sync_bn=self._use_sync_bn,
+        norm_momentum=self._norm_momentum,
+        norm_epsilon=self._norm_epsilon)
+
+    first_block = block_cls(
+        strides=strides, use_projection=needs_projection, **common_kwargs)
+    x = first_block(inputs)
+    for _ in range(1, block_repeats):
+      repeated_block = block_cls(
+          strides=1, use_projection=False, **common_kwargs)
+      x = repeated_block(x)
+    return tf.identity(x, name=name)
+
+  def _build_stem(self, inputs):
+    """Builds the stem layers followed by the initial level-2 block groups.
+
+    Args:
+      inputs: the model input tensor.
+
+    Returns:
+      A list holding the output of each initial block group, in build order.
+    """
+    # 7x7 stride-2 convolution, normalization, activation, 3x3 stride-2 pool.
+    stem_conv = layers.Conv2D(
+        filters=64,
+        kernel_size=7,
+        strides=2,
+        use_bias=False,
+        padding='same',
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)
+    x = stem_conv(inputs)
+    stem_norm = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)
+    x = stem_norm(x)
+    x = tf_utils.get_activation(self._activation)(x)
+    x = layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x)
+
+    # Chain the initial level-2 block groups, keeping every intermediate output.
+    init_filters = int(FILTER_SIZE_MAP[2] * self._filter_size_scale)
+    stem_outputs = []
+    for block_idx in range(self._num_init_blocks):
+      x = self._block_group(
+          inputs=x,
+          filters=init_filters,
+          strides=1,
+          block_fn_cand=self._init_block_fn,
+          block_repeats=self._block_repeats,
+          name='stem_block_{}'.format(block_idx + 1))
+      stem_outputs.append(x)
+    return stem_outputs
+
+  def _build_scale_permuted_network(self,
+                                    net,
+                                    input_width,
+                                    weighted_fusion=False):
+    """Builds the scale-permuted blocks on top of the stem outputs.
+
+    Args:
+      net: list of stem output tensors. It is extended in place with the output
+        of every block built here.
+      input_width: spatial size of the model input, used to derive the expected
+        feature-map size of each block from its level.
+      weighted_fusion: if True, every parent is scaled by a ReLU-ed variable
+        (initialized to 1.0) divided by the sum of those weights before the
+        parents are added.
+
+    Returns:
+      A dict mapping each output level to the output tensor of the block
+      flagged `is_output` for that level.
+
+    Raises:
+      ValueError: if two output blocks share a level, or if the level of an
+        output block lies outside `[min_level, max_level]`.
+    """
+    # Book-keeping lists, index-aligned with `net`.
+    num_stem_blocks = len(net)
+    net_sizes = [int(math.ceil(input_width / 2**2))] * num_stem_blocks
+    net_block_fns = [self._init_block_fn] * num_stem_blocks
+    num_outgoing_connections = [0] * num_stem_blocks
+
+    endpoints = {}
+    for block_idx, spec in enumerate(self._block_specs):
+      # Expected feature-map size, filter count and type of the new block.
+      target_width = int(math.ceil(input_width / 2**spec.level))
+      target_num_filters = int(
+          FILTER_SIZE_MAP[spec.level] * self._filter_size_scale)
+      target_block_fn = spec.block_fn
+
+      # Resample both parent blocks to the target shape.
+      parent_offsets = (spec.input_offsets[0], spec.input_offsets[1])
+      parents = []
+      for offset in parent_offsets:
+        resampled = self._resample_with_alpha(
+            inputs=net[offset],
+            input_width=net_sizes[offset],
+            input_block_fn=net_block_fns[offset],
+            target_width=target_width,
+            target_num_filters=target_num_filters,
+            target_block_fn=target_block_fn,
+            alpha=self._resample_alpha)
+        parents.append(resampled)
+        num_outgoing_connections[offset] += 1
+      first_parent = parents[0]
+
+      # An output block also absorbs every earlier block that nothing consumes
+      # yet, provided dimensions 2 and 3 of its shape match the target.
+      if spec.is_output:
+        for j, candidate in enumerate(net):
+          if num_outgoing_connections[j] != 0:
+            continue
+          if (candidate.shape[2] == target_width and
+              candidate.shape[3] == first_parent.shape[3]):
+            parents.append(candidate)
+            num_outgoing_connections[j] += 1
+
+      if weighted_fusion:
+        dtype = first_parent.dtype
+        fusion_weights = []
+        for j in range(len(parents)):
+          weight_var = tf.Variable(
+              1.0, name='block{}_fusion{}'.format(block_idx, j))
+          fusion_weights.append(tf.nn.relu(tf.cast(weight_var, dtype=dtype)))
+        weights_sum = tf.add_n(fusion_weights)
+        parents = [
+            parent * weight / (weights_sum + 0.0001)
+            for parent, weight in zip(parents, fusion_weights)
+        ]
+
+      # Fuse all parents, then build the new block on the fused feature.
+      fused = tf_utils.get_activation(self._activation)(tf.add_n(parents))
+      block_out = self._block_group(
+          inputs=fused,
+          filters=target_num_filters,
+          strides=1,
+          block_fn_cand=target_block_fn,
+          block_repeats=self._block_repeats,
+          name='scale_permuted_block_{}'.format(block_idx + 1))
+
+      net.append(block_out)
+      net_sizes.append(target_width)
+      net_block_fns.append(target_block_fn)
+      num_outgoing_connections.append(0)
+
+      # Only output blocks are recorded as endpoints.
+      if not spec.is_output:
+        continue
+      if spec.level in endpoints:
+        raise ValueError('Duplicate feats found for output level {}.'.format(
+            spec.level))
+      if not self._min_level <= spec.level <= self._max_level:
+        raise ValueError('Output level is out of range [{}, {}]'.format(
+            self._min_level, self._max_level))
+      endpoints[spec.level] = block_out
+
+    return endpoints
+
+  def _build_endpoints(self, net):
+    """Projects every output level to `endpoints_num_filters` channels.
+
+    Args:
+      net: dict mapping level to feature tensor. It must contain every level in
+        `[min_level, max_level]`.
+
+    Returns:
+      A dict mapping each level in `[min_level, max_level]` to its feature after
+      a 1x1 convolution, normalization and activation.
+    """
+    outputs = {}
+    for level in range(self._min_level, self._max_level + 1):
+      projection = layers.Conv2D(
+          filters=self._endpoints_num_filters,
+          kernel_size=1,
+          strides=1,
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)
+      feat = projection(net[level])
+      norm_layer = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)
+      feat = norm_layer(feat)
+      outputs[level] = tf_utils.get_activation(self._activation)(feat)
+    return outputs
+
+  def _resample_with_alpha(self,
+                           inputs,
+                           input_width,
+                           input_block_fn,
+                           target_width,
+                           target_num_filters,
+                           target_block_fn,
+                           alpha=0.5):
+    """Match resolution and feature dimension.
+
+    The input is projected by a 1x1 conv to `int(input_num_filters * alpha)`
+    filters, spatially resampled towards `target_width`, and then projected
+    by a final 1x1 conv to the target filter count.
+
+    Args:
+      inputs: a tensor with a rank-4 static shape. Its last dimension is read
+        as the number of filters and is used in arithmetic, so it must be
+        statically known.
+      input_width: width of `inputs`; compared with `target_width` to choose
+        between downsampling, upsampling, or no resampling.
+      input_block_fn: block type that produced `inputs`. For 'bottleneck' the
+        filter count is divided by 4 before `alpha` is applied.
+      target_width: width to resample to.
+      target_num_filters: filter count of the target block; multiplied by 4
+        when `target_block_fn` is 'bottleneck'.
+      target_block_fn: block type of the target block.
+      alpha: factor applied to the input filter count to size the
+        intermediate feature.
+
+    Returns:
+      The resampled tensor. No activation is applied after the final
+      normalization.
+    """
+    # Only the last of the four static dims (taken as the channel count) is
+    # used.
+    _, _, _, input_num_filters = inputs.get_shape().as_list()
+    if input_block_fn == 'bottleneck':
+      # Under true division this makes the count a float; int() below
+      # truncates it again after scaling by alpha.
+      input_num_filters /= 4
+    new_num_filters = int(input_num_filters * alpha)
+
+    # 1x1 conv + norm + activation down to the alpha-scaled filter count.
+    x = layers.Conv2D(
+        filters=new_num_filters,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)(
+            inputs)
+    x = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)(
+            x)
+    x = tf_utils.get_activation(self._activation)(x)
+
+    # Spatial resampling.
+    if input_width > target_width:
+      # The first halving is a stride-2 3x3 conv followed by norm and
+      # activation.
+      x = layers.Conv2D(
+          filters=new_num_filters,
+          kernel_size=3,
+          strides=2,
+          padding='SAME',
+          use_bias=False,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer)(
+              x)
+      x = self._norm(
+          axis=self._bn_axis,
+          momentum=self._norm_momentum,
+          epsilon=self._norm_epsilon)(
+              x)
+      x = tf_utils.get_activation(self._activation)(x)
+      input_width /= 2
+      # Any further halvings use stride-2 max pooling until the tracked width
+      # is no longer greater than the target.
+      while input_width > target_width:
+        x = layers.MaxPool2D(pool_size=3, strides=2, padding='SAME')(x)
+        input_width /= 2
+    elif input_width < target_width:
+      # Floor division: the upsampling factor is the integer part of the
+      # width ratio.
+      scale = target_width // input_width
+      x = layers.UpSampling2D(size=(scale, scale))(x)
+
+    # Last 1x1 conv to match filter size.
+    if target_block_fn == 'bottleneck':
+      target_num_filters *= 4
+    x = layers.Conv2D(
+        filters=target_num_filters,
+        kernel_size=1,
+        strides=1,
+        use_bias=False,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer)(
+            x)
+    # Normalization only; unlike the stages above, no activation follows.
+    x = self._norm(
+        axis=self._bn_axis,
+        momentum=self._norm_momentum,
+        epsilon=self._norm_epsilon)(
+            x)
+
+    return x
+
+
+class SpineNetBuilder(object):
+  """SpineNet builder.
+
+  Stores the configuration for a `SpineNet` model. The settings that depend
+  on the model size are looked up in `SCALING_MAP` by `model_id`; the
+  remaining arguments are kept as given. Calling the builder constructs a
+  `SpineNet` from the stored configuration and applies it to the inputs.
+  """
+
+  # NOTE(review): the `input_specs` and `block_specs` defaults are evaluated
+  # once, when this `def` statement runs, so every builder created without
+  # those arguments shares the same default objects.
+  def __init__(self,
+               model_id,
+               input_specs=tf.keras.layers.InputSpec(shape=[None, 640, 640, 3]),
+               min_level=3,
+               max_level=7,
+               block_specs=build_block_specs(),
+               kernel_initializer='VarianceScaling',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activation='relu',
+               use_sync_bn=False,
+               norm_momentum=0.99,
+               norm_epsilon=0.001):
+    """Stores the model configuration.
+
+    Args:
+      model_id: key into `SCALING_MAP` selecting `endpoints_num_filters`,
+        `resample_alpha`, `block_repeats` and `filter_size_scale`.
+      input_specs: forwarded to `SpineNet` as `input_specs`.
+      min_level: forwarded to `SpineNet` as `min_level`.
+      max_level: forwarded to `SpineNet` as `max_level`.
+      block_specs: forwarded to `SpineNet` as `block_specs`.
+      kernel_initializer: forwarded to `SpineNet` as `kernel_initializer`.
+      kernel_regularizer: forwarded to `SpineNet` as `kernel_regularizer`.
+      bias_regularizer: forwarded to `SpineNet` as `bias_regularizer`.
+      activation: forwarded to `SpineNet` as `activation`.
+      use_sync_bn: forwarded to `SpineNet` as `use_sync_bn`.
+      norm_momentum: forwarded to `SpineNet` as `norm_momentum`.
+      norm_epsilon: forwarded to `SpineNet` as `norm_epsilon`.
+
+    Raises:
+      ValueError: if `model_id` is not a key of `SCALING_MAP`.
+    """
+    if model_id not in SCALING_MAP:
+      raise ValueError(
+          'SpineNet {} is not a valid architecture.'.format(model_id))
+    scaling_params = SCALING_MAP[model_id]
+    self._input_specs = input_specs
+    self._min_level = min_level
+    self._max_level = max_level
+    self._block_specs = block_specs
+    # Model-size-dependent settings come from the scaling table.
+    self._endpoints_num_filters = scaling_params['endpoints_num_filters']
+    self._resample_alpha = scaling_params['resample_alpha']
+    self._block_repeats = scaling_params['block_repeats']
+    self._filter_size_scale = scaling_params['filter_size_scale']
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    self._activation = activation
+    self._use_sync_bn = use_sync_bn
+    self._norm_momentum = norm_momentum
+    self._norm_epsilon = norm_epsilon
+
+  def __call__(self, inputs, is_training=None):
+    """Builds a `SpineNet` and applies it to `inputs`.
+
+    Args:
+      inputs: passed directly to the constructed `SpineNet` model.
+      is_training: not referenced in this method.
+
+    Returns:
+      The result of calling the constructed `SpineNet` on `inputs`.
+    """
+    # A new SpineNet instance is constructed on every call, inside the
+    # context provided by `keras_utils.maybe_enter_backend_graph()`.
+    with keras_utils.maybe_enter_backend_graph():
+      model = SpineNet(
+          input_specs=self._input_specs,
+          min_level=self._min_level,
+          max_level=self._max_level,
+          block_specs=self._block_specs,
+          endpoints_num_filters=self._endpoints_num_filters,
+          resample_alpha=self._resample_alpha,
+          block_repeats=self._block_repeats,
+          filter_size_scale=self._filter_size_scale,
+          kernel_initializer=self._kernel_initializer,
+          kernel_regularizer=self._kernel_regularizer,
+          bias_regularizer=self._bias_regularizer,
+          activation=self._activation,
+          use_sync_bn=self._use_sync_bn,
+          norm_momentum=self._norm_momentum,
+          norm_epsilon=self._norm_epsilon)
+      return model(inputs)
--- a/official/vision/detection/modeling/losses.py
+++ b/official/vision/detection/modeling/losses.py
@@ -449,7 +449,7 @@ class RetinanetBoxLoss(object):
      num_positives: number of positive examples in the minibatch.

    Returns:
-      an integar tensor representing total box regression loss.
+      an integer tensor representing total box regression loss.
    """
    # Sums all positives in a batch for normalization and avoids zero
    # num_positives_sum, which would lead to inf loss during training
@@ -457,7 +457,6 @@ class RetinanetBoxLoss(object):

    box_losses = []
    for level in box_outputs.keys():
-      # Onehot encoding for classification labels.
      box_targets_l = labels[level]
      box_losses.append(
          self.box_loss(box_outputs[level], box_targets_l, num_positives_sum))

--- a/official/vision/detection/modeling/maskrcnn_model.py
+++ b/official/vision/detection/modeling/maskrcnn_model.py
@@ -20,13 +20,13 @@ from __future__ import print_function

 import tensorflow as tf

-from tensorflow.python.keras import backend
 from official.vision.detection.dataloader import anchor
 from official.vision.detection.dataloader import mode_keys
 from official.vision.detection.evaluation import factory as eval_factory
 from official.vision.detection.modeling import base_model
 from official.vision.detection.modeling import losses
 from official.vision.detection.modeling.architecture import factory
+from official.vision.detection.modeling.architecture import keras_utils
 from official.vision.detection.ops import postprocess_ops
 from official.vision.detection.ops import roi_ops
 from official.vision.detection.ops import spatial_transform_ops
@@ -297,7 +297,7 @@ class MaskrcnnModel(base_model.Model):
  def build_model(self, params, mode):
    if self._keras_model is None:
      input_layers = self.build_input_layers(self._params, mode)
-      with backend.get_graph().as_default():
+      with keras_utils.maybe_enter_backend_graph():
        outputs = self.model_outputs(input_layers, mode)

        model = tf.keras.models.Model(