"app/package.json" did not exist on "ad4ffdf754e4e9bc73d3a92c2192a02704d9cbf3"
Commit 657dcda5 authored by Kaushik Shivakumar's avatar Kaushik Shivakumar
Browse files

pull latest

parents 26e24e21 e6017471
@@ -26,7 +26,6 @@ from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import bert
from official.nlp.data import sentence_prediction_dataloader
-from official.nlp.modeling import losses as loss_lib
from official.nlp.tasks import utils
@@ -36,6 +35,7 @@ class SentencePredictionConfig(cfg.TaskConfig):
  # At most one of `init_checkpoint` and `hub_module_url` can
  # be specified.
  init_checkpoint: str = ''
+  init_cls_pooler: bool = False
  hub_module_url: str = ''
  metric_type: str = 'accuracy'
  model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(
@@ -55,11 +55,11 @@ class SentencePredictionConfig(cfg.TaskConfig):
class SentencePredictionTask(base_task.Task):
  """Task object for sentence_prediction."""

-  def __init__(self, params=cfg.TaskConfig):
-    super(SentencePredictionTask, self).__init__(params)
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
+    super(SentencePredictionTask, self).__init__(params, logging_dir)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
-                       '`pretrain_checkpoint_dir` can be specified.')
+                       '`init_checkpoint` can be specified.')
    if params.hub_module_url:
      self._hub_module = hub.load(params.hub_module_url)
    else:
@@ -75,10 +75,10 @@ class SentencePredictionTask(base_task.Task):
    return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
-    loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
-        labels=labels,
-        predictions=tf.nn.log_softmax(
-            tf.cast(model_outputs['sentence_prediction'], tf.float32), axis=-1))
+    loss = tf.keras.losses.sparse_categorical_crossentropy(
+        labels,
+        tf.cast(model_outputs['sentence_prediction'], tf.float32),
+        from_logits=True)
    if aux_losses:
      loss += tf.add_n(aux_losses)
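The replacement computes sparse cross-entropy straight from logits. As a quick illustrative check (toy logits and labels, not part of this commit), passing `from_logits=True` matches applying an explicit `log_softmax` and gathering the true-class log-probability:

```python
import tensorflow as tf

# Toy batch: 2 examples, 3 classes. Values are illustrative only.
logits = tf.constant([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
labels = tf.constant([0, 2])

# Form used by the commit: per-example sparse CE computed directly from logits.
ce = tf.keras.losses.sparse_categorical_crossentropy(
    labels, logits, from_logits=True)

# Equivalent computation via an explicit log-softmax.
log_probs = tf.nn.log_softmax(logits, axis=-1)
manual = -tf.gather(log_probs, labels, batch_dims=1)

tf.debugging.assert_near(ce, manual)  # both are per-example losses
```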
@@ -94,7 +94,7 @@ class SentencePredictionTask(base_task.Task):
          input_word_ids=dummy_ids,
          input_mask=dummy_ids,
          input_type_ids=dummy_ids)
-      y = tf.ones((1, 1), dtype=tf.int32)
+      y = tf.zeros((1, 1), dtype=tf.int32)
      return (x, y)

    dataset = tf.data.Dataset.range(1)
@@ -126,25 +126,26 @@ class SentencePredictionTask(base_task.Task):
    outputs = self.inference_step(features, model)
    loss = self.build_losses(
        labels=labels, model_outputs=outputs, aux_losses=model.losses)
+    logs = {self.loss: loss}
    if self.metric_type == 'matthews_corrcoef':
-      return {
-          self.loss:
-              loss,
+      logs.update({
          'sentence_prediction':
              tf.expand_dims(
                  tf.math.argmax(outputs['sentence_prediction'], axis=1),
                  axis=0),
          'labels':
              labels,
-      }
+      })
    if self.metric_type == 'pearson_spearman_corr':
-      return {
-          self.loss: loss,
+      logs.update({
          'sentence_prediction': outputs['sentence_prediction'],
          'labels': labels,
-      }
+      })
+    return logs

  def aggregate_logs(self, state=None, step_outputs=None):
+    if self.metric_type == 'accuracy':
+      return None
    if state is None:
      state = {'sentence_prediction': [], 'labels': []}
    state['sentence_prediction'].append(
@@ -178,13 +179,16 @@ class SentencePredictionTask(base_task.Task):
      return
    pretrain2finetune_mapping = {
-        'encoder':
-            model.checkpoint_items['encoder'],
-        'next_sentence.pooler_dense':
-            model.checkpoint_items['sentence_prediction.pooler_dense'],
+        'encoder': model.checkpoint_items['encoder'],
    }
+    # TODO(b/160251903): Investigate why no pooler dense improves finetuning
+    # accuracies.
+    if self.task_config.init_cls_pooler:
+      pretrain2finetune_mapping[
+          'next_sentence.pooler_dense'] = model.checkpoint_items[
+              'sentence_prediction.pooler_dense']
    ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping)
-    status = ckpt.restore(ckpt_dir_or_file)
+    status = ckpt.read(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
    logging.info('finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
@@ -15,7 +15,12 @@
# ==============================================================================
"""Tagging (e.g., NER/POS) task."""
import logging
+from typing import List, Optional

import dataclasses
+from seqeval import metrics as seqeval_metrics
import tensorflow as tf
import tensorflow_hub as hub
@@ -36,12 +41,12 @@ class TaggingConfig(cfg.TaskConfig):
  model: encoders.TransformerEncoderConfig = (
      encoders.TransformerEncoderConfig())
-  # The number of real labels. Note that a word may be tokenized into
-  # multiple word_pieces tokens, and we assume the real label id (non-negative)
-  # is assigned to the first token of the word, and a negative label id is
-  # assigned to the remaining tokens. The negative label id will not contribute
-  # to loss and metrics.
-  num_classes: int = 0
+  # The real class names, the order of which should match real label id.
+  # Note that a word may be tokenized into multiple word_pieces tokens, and
+  # we assume the real label id (non-negative) is assigned to the first token
+  # of the word, and a negative label id is assigned to the remaining tokens.
+  # The negative label id will not contribute to loss and metrics.
+  class_names: Optional[List[str]] = None
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()
@@ -70,13 +75,13 @@ def _masked_labels_and_weights(y_true):
class TaggingTask(base_task.Task):
  """Task object for tagging (e.g., NER or POS)."""

-  def __init__(self, params=cfg.TaskConfig):
-    super(TaggingTask, self).__init__(params)
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
+    super(TaggingTask, self).__init__(params, logging_dir)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
                       '`init_checkpoint` can be specified.')
-    if params.num_classes == 0:
-      raise ValueError('TaggingConfig.num_classes cannot be 0.')
+    if not params.class_names:
+      raise ValueError('TaggingConfig.class_names cannot be empty.')
    if params.hub_module_url:
      self._hub_module = hub.load(params.hub_module_url)
@@ -92,7 +97,7 @@ class TaggingTask(base_task.Task):
    return models.BertTokenClassifier(
        network=encoder_network,
-        num_classes=self.task_config.num_classes,
+        num_classes=len(self.task_config.class_names),
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=self.task_config.model.initializer_range),
        dropout_rate=self.task_config.model.dropout_rate,
@@ -123,7 +128,7 @@ class TaggingTask(base_task.Task):
      y = tf.random.uniform(
          shape=(1, params.seq_length),
          minval=-1,
-          maxval=self.task_config.num_classes,
+          maxval=len(self.task_config.class_names),
          dtype=tf.dtypes.int32)
      return (x, y)
@@ -136,19 +141,66 @@ class TaggingTask(base_task.Task):
    dataset = tagging_data_loader.TaggingDataLoader(params).load(input_context)
    return dataset

-  def build_metrics(self, training=None):
-    del training
-    # TODO(chendouble): evaluate using seqeval's f1/precision/recall.
-    return [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
-
-  def process_metrics(self, metrics, labels, model_outputs):
-    masked_labels, masked_weights = _masked_labels_and_weights(labels)
-    for metric in metrics:
-      metric.update_state(masked_labels, model_outputs, masked_weights)
-
-  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    masked_labels, masked_weights = _masked_labels_and_weights(labels)
-    compiled_metrics.update_state(masked_labels, model_outputs, masked_weights)
+  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
+    """Validation step.
+
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs.
+    """
+    features, labels = inputs
+    outputs = self.inference_step(features, model)
+    loss = self.build_losses(labels=labels, model_outputs=outputs)
+
+    # Negative label ids are padding labels which should be ignored.
+    real_label_index = tf.where(tf.greater_equal(labels, 0))
+    predict_ids = tf.math.argmax(outputs, axis=-1)
+    predict_ids = tf.gather_nd(predict_ids, real_label_index)
+    label_ids = tf.gather_nd(labels, real_label_index)
+    return {
+        self.loss: loss,
+        'predict_ids': predict_ids,
+        'label_ids': label_ids,
+    }
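For readers following the masking above, a small illustrative sketch (toy tensors, not from this commit) of how `tf.where` plus `tf.gather_nd` drops the padded word-piece positions:

```python
import tensorflow as tf

# Toy batch: 2 sentences, 4 word-piece positions, 3 tag classes.
# Negative ids mark word pieces that carry no real label.
labels = tf.constant([[1, -1, 0, -1],
                      [2, 0, -1, -1]])
logits = tf.random.uniform((2, 4, 3))

real_label_index = tf.where(tf.greater_equal(labels, 0))   # shape [N, 2]
predict_ids = tf.math.argmax(logits, axis=-1)              # shape [2, 4]

predict_ids = tf.gather_nd(predict_ids, real_label_index)  # shape [N]
label_ids = tf.gather_nd(labels, real_label_index)         # -> [1, 0, 2, 0]
```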
+  def aggregate_logs(self, state=None, step_outputs=None):
+    """Aggregates over logs returned from a validation step."""
+    if state is None:
+      state = {'predict_class': [], 'label_class': []}
+
+    def id_to_class_name(batched_ids):
+      class_names = []
+      for per_example_ids in batched_ids:
+        class_names.append([])
+        for per_token_id in per_example_ids.numpy().tolist():
+          class_names[-1].append(self.task_config.class_names[per_token_id])
+      return class_names
+
+    # Convert id to class names, because `seqeval_metrics` relies on the class
+    # name to decide IOB tags.
+    state['predict_class'].extend(id_to_class_name(step_outputs['predict_ids']))
+    state['label_class'].extend(id_to_class_name(step_outputs['label_ids']))
+    return state
+
+  def reduce_aggregated_logs(self, aggregated_logs):
+    """Reduces aggregated logs over validation steps."""
+    label_class = aggregated_logs['label_class']
+    predict_class = aggregated_logs['predict_class']
+    return {
+        'f1':
+            seqeval_metrics.f1_score(label_class, predict_class),
+        'precision':
+            seqeval_metrics.precision_score(label_class, predict_class),
+        'recall':
+            seqeval_metrics.recall_score(label_class, predict_class),
+        'accuracy':
+            seqeval_metrics.accuracy_score(label_class, predict_class),
+    }
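The id-to-name conversion matters because seqeval parses IOB-style tag strings rather than integer ids; a minimal illustrative call with toy tag sequences (not from this commit):

```python
from seqeval import metrics as seqeval_metrics

# Two toy sentences with gold and predicted IOB tags.
label_class = [['B-PER', 'I-PER', 'O'], ['O', 'B-PER', 'O']]
predict_class = [['B-PER', 'O', 'O'], ['O', 'B-PER', 'O']]

print(seqeval_metrics.f1_score(label_class, predict_class))
print(seqeval_metrics.precision_score(label_class, predict_class))
print(seqeval_metrics.recall_score(label_class, predict_class))
print(seqeval_metrics.accuracy_score(label_class, predict_class))
```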

  def initialize(self, model):
    """Load a pretrained checkpoint (if exists) and then train from iter 0."""
......
@@ -58,7 +58,7 @@ class TaggingTest(tf.test.TestCase):
        init_checkpoint=saved_path,
        model=self._encoder_config,
        train_data=self._train_data_config,
-        num_classes=3)
+        class_names=["O", "B-PER", "I-PER"])
    task = tagging.TaggingTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
@@ -74,7 +74,7 @@ class TaggingTest(tf.test.TestCase):
    config = tagging.TaggingConfig(
        model=self._encoder_config,
        train_data=self._train_data_config,
-        num_classes=3)
+        class_names=["O", "B-PER", "I-PER"])
    task = tagging.TaggingTask(config)
    model = task.build_model()
@@ -116,10 +116,31 @@ class TaggingTest(tf.test.TestCase):
    config = tagging.TaggingConfig(
        hub_module_url=hub_module_url,
        model=self._encoder_config,
-        num_classes=4,
+        class_names=["O", "B-PER", "I-PER"],
        train_data=self._train_data_config)
    self._run_task(config)
+
+  def test_seqeval_metrics(self):
+    config = tagging.TaggingConfig(
+        model=self._encoder_config,
+        train_data=self._train_data_config,
+        class_names=["O", "B-PER", "I-PER"])
+    task = tagging.TaggingTask(config)
+    model = task.build_model()
+    dataset = task.build_inputs(config.train_data)
+
+    iterator = iter(dataset)
+    strategy = tf.distribute.get_strategy()
+    distributed_outputs = strategy.run(
+        functools.partial(task.validation_step, model=model),
+        args=(next(iterator),))
+    outputs = tf.nest.map_structure(strategy.experimental_local_results,
+                                    distributed_outputs)
+    aggregated = task.aggregate_logs(step_outputs=outputs)
+    aggregated = task.aggregate_logs(state=aggregated, step_outputs=outputs)
+    self.assertCountEqual({"f1", "precision", "recall", "accuracy"},
+                          task.reduce_aggregated_logs(aggregated).keys())

if __name__ == "__main__":
  tf.test.main()
@@ -18,9 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

-import numpy as np
import tensorflow as tf

-K = tf.keras.backend

class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
@@ -66,72 +64,3 @@ class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        'hidden_size': self.hidden_size,
        'warmup_steps': self.warmup_steps,
    }
-
-
-class LearningRateFn(object):
-  """Creates learning rate function."""
-
-  def __init__(self, learning_rate, hidden_size, warmup_steps):
-    self.learning_rate = learning_rate
-    self.hidden_size = hidden_size
-    self.warmup_steps = float(warmup_steps)
-
-  def __call__(self, global_step):
-    """Calculate learning rate with linear warmup and rsqrt decay."""
-    step = float(global_step)
-    learning_rate = self.learning_rate
-    learning_rate *= (self.hidden_size ** -0.5)
-    # Apply linear warmup
-    learning_rate *= np.minimum(1.0, step / self.warmup_steps)
-    # Apply rsqrt decay
-    learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps))
-    return learning_rate
-
-
-class LearningRateScheduler(tf.keras.callbacks.Callback):
-  """Keras callback to schedule learning rate.
-
-  TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in
-  official/resnet/keras/keras_common.py.
-  """
-
-  def __init__(self, schedule, init_steps=None, verbose=False):
-    super(LearningRateScheduler, self).__init__()
-    self.schedule = schedule
-    self.verbose = verbose
-    if init_steps is None:
-      init_steps = 0.0
-    self.steps = float(init_steps)  # Total steps during training.
-
-  def on_epoch_begin(self, epoch, logs=None):
-    if not hasattr(self.model.optimizer, 'lr'):
-      raise ValueError('Optimizer must have a "lr" attribute.')
-    if not hasattr(self.model.optimizer, 'iterations'):
-      raise ValueError('Optimizer must have a "iterations" attribute.')
-
-  def on_train_batch_begin(self, batch, logs=None):
-    """Adjusts learning rate for each train batch."""
-    if self.verbose > 0:
-      iterations = K.get_value(self.model.optimizer.iterations)
-      print('Original iteration %d' % iterations)
-
-    self.steps += 1.0
-    try:  # new API
-      lr = float(K.get_value(self.model.optimizer.lr))
-      lr = self.schedule(self.steps, lr)
-    except TypeError:  # Support for old API for backward compatibility
-      lr = self.schedule(self.steps)
-    if not isinstance(lr, (float, np.float32, np.float64)):
-      raise ValueError('The output of the "schedule" function '
-                       'should be float.')
-    K.set_value(self.model.optimizer.lr, lr)
-    K.set_value(self.model.optimizer.iterations, self.steps)
-    if self.verbose > 0:
-      print('Batch %05d Step %05d: LearningRateScheduler setting learning '
-            'rate to %s.' % (batch + 1, self.steps, lr))
-
-  def on_epoch_end(self, epoch, logs=None):
-    logs = logs or {}
-    logs['lr'] = K.get_value(self.model.optimizer.lr)
-    logs['steps'] = self.steps
@@ -241,7 +241,7 @@ class TransformerTask(object):
    if params["use_ctl"]:
      train_ds_iterator = iter(train_ds)

-    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)
+    callbacks = self._create_callbacks(flags_obj.model_dir, params)

    # Only TimeHistory callback is supported for CTL
    if params["use_ctl"]:
@@ -408,14 +408,9 @@ class TransformerTask(object):
    for i in range(length):
      translate.translate_from_input(val_outputs[i], subtokenizer)

-  def _create_callbacks(self, cur_log_dir, init_steps, params):
+  def _create_callbacks(self, cur_log_dir, params):
    """Creates a list of callbacks."""
-    sfunc = optimizer.LearningRateFn(params["learning_rate"],
-                                     params["hidden_size"],
-                                     params["learning_rate_warmup_steps"])
-    scheduler_callback = optimizer.LearningRateScheduler(sfunc, init_steps)
    callbacks = misc.get_callbacks()
-    callbacks.append(scheduler_callback)
    if params["enable_checkpointing"]:
      ckpt_full_path = os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt")
      callbacks.append(
@@ -445,7 +440,7 @@ class TransformerTask(object):
        params["learning_rate"], params["hidden_size"],
        params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(
-        lr_schedule if self.use_tpu else params["learning_rate"],
+        lr_schedule,
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])
......
@@ -181,7 +181,7 @@ def translate_file(model,
    raise ValueError("File output is a directory, will not save outputs to "
                     "file.")
  logging.info("Writing to file %s", output_file)
-  with tf.compat.v1.gfile.Open(output_file, "w") as f:
+  with tf.io.gfile.GFile(output_file, "w") as f:
    for i in sorted_keys:
      f.write("%s\n" % translations[i])
......
@@ -45,6 +45,9 @@ def _get_requirements():
      os.path.join(os.path.dirname(__file__), '../requirements.txt'), 'r') as f:
    for line in f:
      package_name = line.strip()
+      # Skip empty line or comments starting with "#".
+      if not package_name or package_name[0] == '#':
+        continue
      if package_name.startswith('-e '):
        dependency_links_tmp.append(package_name[3:].strip())
      else:
......
@@ -16,10 +16,13 @@ dataclasses
gin-config
tf_slim>=1.1.0
typing
-sentencepiece
Cython
matplotlib
-opencv-python-headless
pyyaml
+# CV related dependencies
+opencv-python-headless
Pillow
-e git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI
+# NLP related dependencies
+seqeval
+sentencepiece
@@ -48,6 +48,22 @@ so the checkpoints are not compatible.
We will unify the implementation soon.

### Train a SpineNet-49 based RetinaNet.

```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
  --strategy_type=tpu \
  --tpu="${TPU_NAME?}" \
  --model_dir="${MODEL_DIR?}" \
  --mode=train \
  --params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```

### Train a custom RetinaNet using the config file.
@@ -163,6 +179,24 @@ so the checkpoints are not compatible.
We will unify the implementation soon.

### Train a SpineNet-49 based Mask R-CNN.

```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
  --strategy_type=tpu \
  --tpu="${TPU_NAME?}" \
  --model_dir="${MODEL_DIR?}" \
  --mode=train \
  --model=mask_rcnn \
  --params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train: { train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```

### Train a custom Mask R-CNN using the config file.

First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
......
@@ -17,10 +17,12 @@
BACKBONES = [
    'resnet',
+    'spinenet',
]

MULTILEVEL_FEATURES = [
    'fpn',
+    'identity',
]

# pylint: disable=line-too-long
@@ -118,6 +120,9 @@ BASE_CFG = {
    'resnet': {
        'resnet_depth': 50,
    },
+    'spinenet': {
+        'model_id': '49',
+    },
    'fpn': {
        'fpn_feat_dims': 256,
        'use_separable_conv': False,
......
@@ -23,6 +23,7 @@ from official.vision.detection.modeling.architecture import heads
from official.vision.detection.modeling.architecture import identity
from official.vision.detection.modeling.architecture import nn_ops
from official.vision.detection.modeling.architecture import resnet
+from official.vision.detection.modeling.architecture import spinenet


def norm_activation_generator(params):
@@ -42,6 +43,9 @@ def backbone_generator(params):
        activation=params.norm_activation.activation,
        norm_activation=norm_activation_generator(
            params.norm_activation))
+  elif params.architecture.backbone == 'spinenet':
+    spinenet_params = params.spinenet
+    backbone_fn = spinenet.SpineNetBuilder(model_id=spinenet_params.model_id)
  else:
    raise ValueError('Backbone model `{}` is not supported.'
                     .format(params.architecture.backbone))
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains common building blocks for neural networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.modeling import tf_utils
@tf.keras.utils.register_keras_serializable(package='Vision')
class ResidualBlock(tf.keras.layers.Layer):
"""A residual block."""
def __init__(self,
filters,
strides,
use_projection=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""A residual block with BN after convolutions.
Args:
filters: `int` number of filters for the two convolutions in this block.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Default to None.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
Default to None.
activation: `str` name of the activation function.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
super(ResidualBlock, self).__init__(**kwargs)
self._filters = filters
self._strides = strides
self._use_projection = use_projection
self._use_sync_bn = use_sync_bn
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
def build(self, input_shape):
if self._use_projection:
self._shortcut = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=1,
strides=self._strides,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv1 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=self._strides,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=1,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
super(ResidualBlock, self).build(input_shape)
def get_config(self):
config = {
'filters': self._filters,
'strides': self._strides,
'use_projection': self._use_projection,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(ResidualBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
shortcut = inputs
if self._use_projection:
shortcut = self._shortcut(shortcut)
shortcut = self._norm0(shortcut)
x = self._conv1(inputs)
x = self._norm1(x)
x = self._activation_fn(x)
x = self._conv2(x)
x = self._norm2(x)
return self._activation_fn(x + shortcut)
@tf.keras.utils.register_keras_serializable(package='Vision')
class BottleneckBlock(tf.keras.layers.Layer):
"""A standard bottleneck block."""
def __init__(self,
filters,
strides,
use_projection=False,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""A standard bottleneck block with BN after convolutions.
Args:
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
kernel_initializer: kernel_initializer for convolutional layers.
kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D.
Default to None.
bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d.
Default to None.
activation: `str` name of the activation function.
use_sync_bn: if True, use synchronized batch normalization.
norm_momentum: `float` normalization momentum for the moving average.
norm_epsilon: `float` small float added to variance to avoid dividing by
zero.
**kwargs: keyword arguments to be passed.
"""
super(BottleneckBlock, self).__init__(**kwargs)
self._filters = filters
self._strides = strides
self._use_projection = use_projection
self._use_sync_bn = use_sync_bn
self._activation = activation
self._kernel_initializer = kernel_initializer
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
if use_sync_bn:
self._norm = tf.keras.layers.experimental.SyncBatchNormalization
else:
self._norm = tf.keras.layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
self._activation_fn = tf_utils.get_activation(activation)
def build(self, input_shape):
if self._use_projection:
self._shortcut = tf.keras.layers.Conv2D(
filters=self._filters * 4,
kernel_size=1,
strides=self._strides,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm0 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv1 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm1 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv2 = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=3,
strides=self._strides,
padding='same',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm2 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
self._conv3 = tf.keras.layers.Conv2D(
filters=self._filters * 4,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)
self._norm3 = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)
super(BottleneckBlock, self).build(input_shape)
def get_config(self):
config = {
'filters': self._filters,
'strides': self._strides,
'use_projection': self._use_projection,
'kernel_initializer': self._kernel_initializer,
'kernel_regularizer': self._kernel_regularizer,
'bias_regularizer': self._bias_regularizer,
'activation': self._activation,
'use_sync_bn': self._use_sync_bn,
'norm_momentum': self._norm_momentum,
'norm_epsilon': self._norm_epsilon
}
base_config = super(BottleneckBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
shortcut = inputs
if self._use_projection:
shortcut = self._shortcut(shortcut)
shortcut = self._norm0(shortcut)
x = self._conv1(inputs)
x = self._norm1(x)
x = self._activation_fn(x)
x = self._conv2(x)
x = self._norm2(x)
x = self._activation_fn(x)
x = self._conv3(x)
x = self._norm3(x)
return self._activation_fn(x + shortcut)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of SpineNet model.
X. Du, T-Y. Lin, P. Jin, G. Ghiasi, M. Tan, Y. Cui, Q. V. Le, X. Song
SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization
https://arxiv.org/abs/1912.05027
"""
import math
from absl import logging
import tensorflow as tf
from tensorflow.python.keras import backend
from official.modeling import tf_utils
from official.vision.detection.modeling.architecture import nn_blocks
layers = tf.keras.layers
FILTER_SIZE_MAP = {
1: 32,
2: 64,
3: 128,
4: 256,
5: 256,
6: 256,
7: 256,
}
# The fixed SpineNet architecture discovered by NAS.
# Each element represents a specification of a building block:
# (block_level, block_fn, (input_offset0, input_offset1), is_output).
SPINENET_BLOCK_SPECS = [
(2, 'bottleneck', (0, 1), False),
(4, 'residual', (0, 1), False),
(3, 'bottleneck', (2, 3), False),
(4, 'bottleneck', (2, 4), False),
(6, 'residual', (3, 5), False),
(4, 'bottleneck', (3, 5), False),
(5, 'residual', (6, 7), False),
(7, 'residual', (6, 8), False),
(5, 'bottleneck', (8, 9), False),
(5, 'bottleneck', (8, 10), False),
(4, 'bottleneck', (5, 10), True),
(3, 'bottleneck', (4, 10), True),
(5, 'bottleneck', (7, 12), True),
(7, 'bottleneck', (5, 14), True),
(6, 'bottleneck', (12, 14), True),
]
SCALING_MAP = {
'49S': {
'endpoints_num_filters': 128,
'filter_size_scale': 0.65,
'resample_alpha': 0.5,
'block_repeats': 1,
},
'49': {
'endpoints_num_filters': 256,
'filter_size_scale': 1.0,
'resample_alpha': 0.5,
'block_repeats': 1,
},
'96': {
'endpoints_num_filters': 256,
'filter_size_scale': 1.0,
'resample_alpha': 0.5,
'block_repeats': 2,
},
'143': {
'endpoints_num_filters': 256,
'filter_size_scale': 1.0,
'resample_alpha': 1.0,
'block_repeats': 3,
},
'190': {
'endpoints_num_filters': 512,
'filter_size_scale': 1.3,
'resample_alpha': 1.0,
'block_repeats': 4,
},
}
class BlockSpec(object):
"""A container class that specifies the block configuration for SpineNet."""
def __init__(self, level, block_fn, input_offsets, is_output):
self.level = level
self.block_fn = block_fn
self.input_offsets = input_offsets
self.is_output = is_output
def build_block_specs(block_specs=None):
"""Builds the list of BlockSpec objects for SpineNet."""
if not block_specs:
block_specs = SPINENET_BLOCK_SPECS
logging.info('Building SpineNet block specs: %s', block_specs)
return [BlockSpec(*b) for b in block_specs]
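For illustration (assuming the definitions above in this module), the first tuple in SPINENET_BLOCK_SPECS maps onto a BlockSpec like so:

```python
# Illustrative only: the first entry, (2, 'bottleneck', (0, 1), False), is a
# level-2 bottleneck block whose two inputs are the stem blocks at offsets 0
# and 1, and it is not an output block.
specs = build_block_specs()
first = specs[0]
print(first.level, first.block_fn, first.input_offsets, first.is_output)
# -> 2 bottleneck (0, 1) False
```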
@tf.keras.utils.register_keras_serializable(package='Vision')
class SpineNet(tf.keras.Model):
"""Class to build SpineNet models."""
def __init__(self,
input_specs=tf.keras.layers.InputSpec(shape=[None, 640, 640, 3]),
min_level=3,
max_level=7,
block_specs=build_block_specs(),
endpoints_num_filters=256,
resample_alpha=0.5,
block_repeats=1,
filter_size_scale=1.0,
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001,
**kwargs):
"""SpineNet model."""
self._min_level = min_level
self._max_level = max_level
self._block_specs = block_specs
self._endpoints_num_filters = endpoints_num_filters
self._resample_alpha = resample_alpha
self._block_repeats = block_repeats
self._filter_size_scale = filter_size_scale
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
if activation == 'relu':
self._activation = tf.nn.relu
elif activation == 'swish':
self._activation = tf.nn.swish
else:
raise ValueError('Activation {} not implemented.'.format(activation))
self._init_block_fn = 'bottleneck'
self._num_init_blocks = 2
if use_sync_bn:
self._norm = layers.experimental.SyncBatchNormalization
else:
self._norm = layers.BatchNormalization
if tf.keras.backend.image_data_format() == 'channels_last':
self._bn_axis = -1
else:
self._bn_axis = 1
# Build SpineNet.
inputs = tf.keras.Input(shape=input_specs.shape[1:])
net = self._build_stem(inputs=inputs)
net = self._build_scale_permuted_network(
net=net, input_width=input_specs.shape[1])
net = self._build_endpoints(net=net)
super(SpineNet, self).__init__(inputs=inputs, outputs=net)
def _block_group(self,
inputs,
filters,
strides,
block_fn_cand,
block_repeats=1,
name='block_group'):
"""Creates one group of blocks for the SpineNet model."""
block_fn_candidates = {
'bottleneck': nn_blocks.BottleneckBlock,
'residual': nn_blocks.ResidualBlock,
}
block_fn = block_fn_candidates[block_fn_cand]
_, _, _, num_filters = inputs.get_shape().as_list()
if block_fn_cand == 'bottleneck':
use_projection = not (num_filters == (filters * 4) and strides == 1)
else:
use_projection = not (num_filters == filters and strides == 1)
x = block_fn(
filters=filters,
strides=strides,
use_projection=use_projection,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
inputs)
for _ in range(1, block_repeats):
x = block_fn(
filters=filters,
strides=1,
use_projection=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)(
x)
return tf.identity(x, name=name)
def _build_stem(self, inputs):
"""Build SpineNet stem."""
x = layers.Conv2D(
filters=64,
kernel_size=7,
strides=2,
use_bias=False,
padding='same',
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
inputs)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
x = layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x)
net = []
# Build the initial level 2 blocks.
for i in range(self._num_init_blocks):
x = self._block_group(
inputs=x,
filters=int(FILTER_SIZE_MAP[2] * self._filter_size_scale),
strides=1,
block_fn_cand=self._init_block_fn,
block_repeats=self._block_repeats,
name='stem_block_{}'.format(i + 1))
net.append(x)
return net
def _build_scale_permuted_network(self,
net,
input_width,
weighted_fusion=False):
"""Build scale-permuted network."""
net_sizes = [int(math.ceil(input_width / 2**2))] * len(net)
net_block_fns = [self._init_block_fn] * len(net)
num_outgoing_connections = [0] * len(net)
endpoints = {}
for i, block_spec in enumerate(self._block_specs):
# Find out specs for the target block.
target_width = int(math.ceil(input_width / 2**block_spec.level))
target_num_filters = int(FILTER_SIZE_MAP[block_spec.level] *
self._filter_size_scale)
target_block_fn = block_spec.block_fn
# Resample then merge input0 and input1.
parents = []
input0 = block_spec.input_offsets[0]
input1 = block_spec.input_offsets[1]
x0 = self._resample_with_alpha(
inputs=net[input0],
input_width=net_sizes[input0],
input_block_fn=net_block_fns[input0],
target_width=target_width,
target_num_filters=target_num_filters,
target_block_fn=target_block_fn,
alpha=self._resample_alpha)
parents.append(x0)
num_outgoing_connections[input0] += 1
x1 = self._resample_with_alpha(
inputs=net[input1],
input_width=net_sizes[input1],
input_block_fn=net_block_fns[input1],
target_width=target_width,
target_num_filters=target_num_filters,
target_block_fn=target_block_fn,
alpha=self._resample_alpha)
parents.append(x1)
num_outgoing_connections[input1] += 1
# Merge 0 outdegree blocks to the output block.
if block_spec.is_output:
for j, (j_feat,
j_connections) in enumerate(zip(net, num_outgoing_connections)):
if j_connections == 0 and (j_feat.shape[2] == target_width and
j_feat.shape[3] == x0.shape[3]):
parents.append(j_feat)
num_outgoing_connections[j] += 1
# pylint: disable=g-direct-tensorflow-import
if weighted_fusion:
dtype = parents[0].dtype
parent_weights = [
tf.nn.relu(tf.cast(tf.Variable(1.0, name='block{}_fusion{}'.format(
i, j)), dtype=dtype)) for j in range(len(parents))]
weights_sum = tf.add_n(parent_weights)
parents = [
parents[i] * parent_weights[i] / (weights_sum + 0.0001)
for i in range(len(parents))
]
# Fuse all parent nodes then build a new block.
x = tf_utils.get_activation(self._activation)(tf.add_n(parents))
x = self._block_group(
inputs=x,
filters=target_num_filters,
strides=1,
block_fn_cand=target_block_fn,
block_repeats=self._block_repeats,
name='scale_permuted_block_{}'.format(i + 1))
net.append(x)
net_sizes.append(target_width)
net_block_fns.append(target_block_fn)
num_outgoing_connections.append(0)
# Save output feats.
if block_spec.is_output:
if block_spec.level in endpoints:
raise ValueError('Duplicate feats found for output level {}.'.format(
block_spec.level))
if (block_spec.level < self._min_level or
block_spec.level > self._max_level):
raise ValueError('Output level is out of range [{}, {}]'.format(
self._min_level, self._max_level))
endpoints[block_spec.level] = x
return endpoints
def _build_endpoints(self, net):
"""Match filter size for endpoints before sharing conv layers."""
endpoints = {}
for level in range(self._min_level, self._max_level + 1):
x = layers.Conv2D(
filters=self._endpoints_num_filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
net[level])
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
endpoints[level] = x
return endpoints
def _resample_with_alpha(self,
inputs,
input_width,
input_block_fn,
target_width,
target_num_filters,
target_block_fn,
alpha=0.5):
"""Match resolution and feature dimension."""
_, _, _, input_num_filters = inputs.get_shape().as_list()
if input_block_fn == 'bottleneck':
input_num_filters /= 4
new_num_filters = int(input_num_filters * alpha)
x = layers.Conv2D(
filters=new_num_filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
inputs)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
# Spatial resampling.
if input_width > target_width:
x = layers.Conv2D(
filters=new_num_filters,
kernel_size=3,
strides=2,
padding='SAME',
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
x = tf_utils.get_activation(self._activation)(x)
input_width /= 2
while input_width > target_width:
x = layers.MaxPool2D(pool_size=3, strides=2, padding='SAME')(x)
input_width /= 2
elif input_width < target_width:
scale = target_width // input_width
x = layers.UpSampling2D(size=(scale, scale))(x)
# Last 1x1 conv to match filter size.
if target_block_fn == 'bottleneck':
target_num_filters *= 4
x = layers.Conv2D(
filters=target_num_filters,
kernel_size=1,
strides=1,
use_bias=False,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer)(
x)
x = self._norm(
axis=self._bn_axis,
momentum=self._norm_momentum,
epsilon=self._norm_epsilon)(
x)
return x
class SpineNetBuilder(object):
"""SpineNet builder."""
def __init__(self,
model_id,
input_specs=tf.keras.layers.InputSpec(shape=[None, 640, 640, 3]),
min_level=3,
max_level=7,
block_specs=build_block_specs(),
kernel_initializer='VarianceScaling',
kernel_regularizer=None,
bias_regularizer=None,
activation='relu',
use_sync_bn=False,
norm_momentum=0.99,
norm_epsilon=0.001):
if model_id not in SCALING_MAP:
raise ValueError(
'SpineNet {} is not a valid architecture.'.format(model_id))
scaling_params = SCALING_MAP[model_id]
self._input_specs = input_specs
self._min_level = min_level
self._max_level = max_level
self._block_specs = block_specs
self._endpoints_num_filters = scaling_params['endpoints_num_filters']
self._resample_alpha = scaling_params['resample_alpha']
self._block_repeats = scaling_params['block_repeats']
self._filter_size_scale = scaling_params['filter_size_scale']
self._kernel_initializer = kernel_initializer
self._kernel_regularizer = kernel_regularizer
self._bias_regularizer = bias_regularizer
self._activation = activation
self._use_sync_bn = use_sync_bn
self._norm_momentum = norm_momentum
self._norm_epsilon = norm_epsilon
def __call__(self, inputs, is_training=None):
with backend.get_graph().as_default():
model = SpineNet(
input_specs=self._input_specs,
min_level=self._min_level,
max_level=self._max_level,
block_specs=self._block_specs,
endpoints_num_filters=self._endpoints_num_filters,
resample_alpha=self._resample_alpha,
block_repeats=self._block_repeats,
filter_size_scale=self._filter_size_scale,
kernel_initializer=self._kernel_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activation=self._activation,
use_sync_bn=self._use_sync_bn,
norm_momentum=self._norm_momentum,
norm_epsilon=self._norm_epsilon)
return model(inputs)
@@ -339,7 +339,8 @@ def train_and_eval(
  optimizer = optimizer_factory.build_optimizer(
      optimizer_name=params.model.optimizer.name,
      base_learning_rate=learning_rate,
-      params=params.model.optimizer.as_dict())
+      params=params.model.optimizer.as_dict(),
+      model=model)

  metrics_map = _get_metrics(one_hot)
  metrics = [metrics_map[metric] for metric in params.train.metrics]
......
@@ -18,11 +18,12 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

+from typing import Any, Dict, Text, List

from absl import logging
import tensorflow as tf
import tensorflow_addons as tfa

-from typing import Any, Dict, Text, List
from official.vision.image_classification import learning_rate
from official.vision.image_classification.configs import base_configs
@@ -250,7 +251,8 @@ class MovingAverage(tf.keras.optimizers.Optimizer):
def build_optimizer(
    optimizer_name: Text,
    base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule,
-    params: Dict[Text, Any]):
+    params: Dict[Text, Any],
+    model: tf.keras.Model = None):
  """Build the optimizer based on name.

  Args:
@@ -261,6 +263,8 @@ def build_optimizer(
    params: String -> Any dictionary representing the optimizer params.
      This should contain optimizer specific parameters such as
      `base_learning_rate`, `decay`, etc.
+    model: The `tf.keras.Model`. This is used for the shadow copy if using
+      `MovingAverage`.

  Returns:
    A tf.keras.Optimizer.
@@ -322,10 +326,13 @@ def build_optimizer(
  # Moving average should be applied last, as it's applied at test time
  moving_average_decay = params.get('moving_average_decay', 0.)
  if moving_average_decay is not None and moving_average_decay > 0.:
+    if model is None:
+      raise ValueError('`model` must be provided if using `MovingAverage`.')
    logging.info('Including moving average decay.')
    optimizer = MovingAverage(
-        optimizer,
+        optimizer=optimizer,
        average_decay=moving_average_decay)
+    optimizer.shadow_copy(model)
  return optimizer
......
@@ -19,15 +19,21 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

-import tensorflow as tf
-
from absl.testing import parameterized
+import tensorflow as tf

from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs


class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):

+  def build_toy_model(self) -> tf.keras.Model:
+    """Creates a toy `tf.keras.Model`."""
+    model = tf.keras.Sequential()
+    model.add(tf.keras.layers.Dense(1, input_shape=(1,)))
+    return model
+
  @parameterized.named_parameters(
      ('sgd', 'sgd', 0., False),
      ('momentum', 'momentum', 0., False),
@@ -40,6 +46,7 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
      ('rmsprop_ema', 'rmsprop', 0.999, False))
  def test_optimizer(self, optimizer_name, moving_average_decay, lookahead):
    """Smoke test to be sure no syntax errors."""
+    model = self.build_toy_model()
    params = {
        'learning_rate': 0.001,
        'rho': 0.09,
@@ -51,7 +58,8 @@ class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase):
    optimizer = optimizer_factory.build_optimizer(
        optimizer_name=optimizer_name,
        base_learning_rate=params['learning_rate'],
-        params=params)
+        params=params,
+        model=model)
    self.assertTrue(issubclass(type(optimizer), tf.keras.optimizers.Optimizer))

  def test_unknown_optimizer(self):
......
@@ -86,6 +86,9 @@ message DelfConfig {
  // Path to DELF model.
  optional string model_path = 1;  // Required.

+  // Whether model has been exported using TF version 2+.
+  optional bool is_tf2_exported = 10 [default = false];
+
  // Image scales to be used.
  repeated float image_scales = 2;
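A hedged example of setting the new field from Python; it assumes the generated `delf_config_pb2` module is importable as shown (the import path and the model path are not part of this diff):

```python
from google.protobuf import text_format
from delf import delf_config_pb2  # assumed import path; not part of this diff

config = delf_config_pb2.DelfConfig()
text_format.Parse(
    'model_path: "/tmp/delf_model"  # placeholder path\n'
    'is_tf2_exported: true\n'
    'image_scales: 0.5\n'
    'image_scales: 1.0\n', config)
print(config.is_tf2_exported)  # True
```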
......
@@ -131,7 +131,7 @@ def main(argv):
  delf_dataset = tf.data.Dataset.from_tensor_slices((features_placeholder))
  delf_dataset = delf_dataset.shuffle(1000).batch(
      features_for_clustering.shape[0])
-  iterator = delf_dataset.make_initializable_iterator()
+  iterator = tf.compat.v1.data.make_initializable_iterator(delf_dataset)

  def _initializer_fn(sess):
    """Initialize dataset iterator, feed in the data."""
......
@@ -102,7 +102,15 @@ def MakeExtractor(config):
  Returns:
    Function that receives an image and returns features.
+
+  Raises:
+    ValueError: if config is invalid.
  """
+  # Assert the configuration.
+  if config.use_global_features and hasattr(
+      config, 'is_tf2_exported') and config.is_tf2_exported:
+    raise ValueError('use_global_features is incompatible with is_tf2_exported')
+
  # Load model.
  model = tf.saved_model.load(config.model_path)
@@ -178,7 +186,8 @@ def MakeExtractor(config):
  else:
    global_pca_parameters['variances'] = None

-  model = model.prune(feeds=feeds, fetches=fetches)
+  if not hasattr(config, 'is_tf2_exported') or not config.is_tf2_exported:
+    model = model.prune(feeds=feeds, fetches=fetches)

  def ExtractorFn(image, resize_factor=1.0):
    """Receives an image and returns DELF global and/or local features.
@@ -197,7 +206,6 @@ def MakeExtractor(config):
      features (key 'local_features' mapping to a dict with keys 'locations',
      'descriptors', 'scales', 'attention').
    """
-
    resized_image, scale_factors = ResizeImage(
        image, config, resize_factor=resize_factor)
@@ -224,8 +232,20 @@ def MakeExtractor(config):
    output = None
    if config.use_local_features:
-      output = model(image_tensor, image_scales_tensor, score_threshold_tensor,
-                     max_feature_num_tensor)
+      if hasattr(config, 'is_tf2_exported') and config.is_tf2_exported:
+        predict = model.signatures['serving_default']
+        output_dict = predict(
+            input_image=image_tensor,
+            input_scales=image_scales_tensor,
+            input_max_feature_num=max_feature_num_tensor,
+            input_abs_thres=score_threshold_tensor)
+        output = [
+            output_dict['boxes'], output_dict['features'],
+            output_dict['scales'], output_dict['scores']
+        ]
+      else:
+        output = model(image_tensor, image_scales_tensor,
+                       score_threshold_tensor, max_feature_num_tensor)
    else:
      output = model(image_tensor, image_scales_tensor)
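As background for the new branch, a generic sketch (hypothetical helper and placeholder path, not from this diff) of inspecting a TF2 export's `serving_default` signature before calling it by keyword:

```python
import tensorflow as tf

def describe_serving_signature(saved_model_dir):
  """Prints the serving signature of a TF2-exported SavedModel, if present."""
  model = tf.saved_model.load(saved_model_dir)
  if 'serving_default' in model.signatures:
    predict = model.signatures['serving_default']
    # The keyword arguments used above (input_image, input_scales, ...) must
    # match whatever the exporter recorded here.
    print(predict.structured_input_signature)
    print(predict.structured_outputs)
  else:
    # TF1-style exports are instead pruned to explicit feed/fetch tensors,
    # as in the `model.prune(...)` branch of this diff.
    print('No serving_default signature; prune to feeds/fetches instead.')
```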
......
@@ -269,8 +269,7 @@ class ExtractAggregatedRepresentation(object):
              axis=0), [num_assignments, 1]) - tf.gather(
                  codebook, selected_visual_words[ind])
      return ind + 1, tf.tensor_scatter_nd_add(
-          vlad, tf.expand_dims(selected_visual_words[ind], axis=1),
-          tf.cast(diff, dtype=tf.float32))
+          vlad, tf.expand_dims(selected_visual_words[ind], axis=1), diff)

    ind_vlad = tf.constant(0, dtype=tf.int32)
    keep_going = lambda j, vlad: tf.less(j, num_features)
@@ -396,9 +395,7 @@ class ExtractAggregatedRepresentation(object):
    visual_words = tf.reshape(
        tf.where(
-            tf.greater(
-                per_centroid_norms,
-                tf.cast(tf.sqrt(_NORM_SQUARED_TOLERANCE), dtype=tf.float32))),
+            tf.greater(per_centroid_norms, tf.sqrt(_NORM_SQUARED_TOLERANCE))),
        [-1])
    per_centroid_normalized_vector = tf.math.l2_normalize(
......