Commit 356c98bd authored by Kaushik Shivakumar

Merge remote-tracking branch 'upstream/master' into detr-push-3

parents d31aba8a b9785623
......@@ -17,12 +17,15 @@
import collections
import json
import os
from absl import logging
import dataclasses
import orbit
import tensorflow as tf
import tensorflow_hub as hub
from official.core import base_task
from official.core import task_factory
from official.modeling.hyperparams import base_config
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.bert import squad_evaluate_v1_1
......@@ -39,8 +42,7 @@ from official.nlp.tasks import utils
@dataclasses.dataclass
class ModelConfig(base_config.Config):
"""A base span labeler configuration."""
encoder: encoders.TransformerEncoderConfig = (
encoders.TransformerEncoderConfig())
encoder: encoders.EncoderConfig = encoders.EncoderConfig()
@dataclasses.dataclass
......@@ -57,7 +59,7 @@ class QuestionAnsweringConfig(cfg.TaskConfig):
validation_data: cfg.DataConfig = cfg.DataConfig()
@base_task.register_task_cls(QuestionAnsweringConfig)
@task_factory.register_task_cls(QuestionAnsweringConfig)
class QuestionAnsweringTask(base_task.Task):
"""Task object for question answering."""
......@@ -83,17 +85,21 @@ class QuestionAnsweringTask(base_task.Task):
self._tf_record_input_path, self._eval_examples, self._eval_features = (
self._preprocess_eval_data(params.validation_data))
def set_preprocessed_eval_input_path(self, eval_input_path):
"""Sets the path to the preprocessed eval data."""
self._tf_record_input_path = eval_input_path
def build_model(self):
if self._hub_module:
encoder_network = utils.get_encoder_from_hub(self._hub_module)
else:
encoder_network = encoders.instantiate_encoder_from_cfg(
self.task_config.model.encoder)
encoder_network = encoders.build_encoder(self.task_config.model.encoder)
encoder_cfg = self.task_config.model.encoder.get()
# Currently, we only support bert-style question answering finetuning.
return models.BertSpanLabeler(
network=encoder_network,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=self.task_config.model.encoder.initializer_range))
stddev=encoder_cfg.initializer_range))
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
start_positions = labels['start_positions']
......@@ -241,10 +247,6 @@ class QuestionAnsweringTask(base_task.Task):
step_outputs['end_logits']):
u_ids, s_logits, e_logits = (
unique_ids.numpy(), start_logits.numpy(), end_logits.numpy())
if u_ids.size == 1:
u_ids = [u_ids]
s_logits = [s_logits]
e_logits = [e_logits]
for values in zip(u_ids, s_logits, e_logits):
state.append(self.raw_aggregated_result(
unique_id=values[0],
......@@ -291,16 +293,45 @@ class QuestionAnsweringTask(base_task.Task):
'final_f1': eval_metrics['final_f1']}
return eval_metrics
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
ckpt_dir_or_file = self.task_config.init_checkpoint
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.read(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('Finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
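# Illustrative sketch of a compatible pretrained checkpoint (hypothetical; it
# mirrors the pretraining setup used in the unit test further below). The keys
# written here must match `model.checkpoint_items` read in `initialize`.
#
#   pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg)
#   ckpt = tf.train.Checkpoint(
#       model=pretrain_model, **pretrain_model.checkpoint_items)
#   ckpt.save(task_config.init_checkpoint)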
def predict(task: QuestionAnsweringTask, params: cfg.DataConfig,
model: tf.keras.Model):
"""Predicts on the input data.
Args:
task: A `QuestionAnsweringTask` object.
params: A `cfg.DataConfig` object.
model: A keras.Model.
Returns:
A tuple of `all_predictions`, `all_nbest` and `scores_diff`, which are
dicts that can be written out as the prediction, nbest and null_odds json
files, respectively.
"""
tf_record_input_path, eval_examples, eval_features = (
task._preprocess_eval_data(params)) # pylint: disable=protected-access
# `tf_record_input_path` will overwrite `params.input_path`
# when `task.build_inputs()` is called.
task.set_preprocessed_eval_input_path(tf_record_input_path)
def predict_step(inputs):
"""Replicated prediction calculation."""
return task.validation_step(inputs, model)
dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
task.build_inputs, params)
aggregated_outputs = utils.predict(predict_step, task.aggregate_logs, dataset)
all_predictions, all_nbest, scores_diff = (
task.squad_lib.postprocess_output(
eval_examples,
eval_features,
aggregated_outputs,
task.task_config.n_best_size,
task.task_config.max_answer_length,
task.task_config.validation_data.do_lower_case,
version_2_with_negative=(params.version_2_with_negative),
null_score_diff_threshold=task.task_config.null_score_diff_threshold,
verbose=False))
return all_predictions, all_nbest, scores_diff
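# A minimal usage sketch for `predict` (illustrative only; `config` is a
# hypothetical QuestionAnsweringConfig, mirroring the unit test below):
#
#   task = QuestionAnsweringTask(config)
#   model = task.build_model()
#   all_predictions, all_nbest, scores_diff = predict(
#       task, config.validation_data, model)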
......@@ -25,6 +25,7 @@ from official.nlp.bert import export_tfhub
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import question_answering_dataloader
from official.nlp.tasks import masked_lm
from official.nlp.tasks import question_answering
......@@ -32,21 +33,37 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(QuestionAnsweringTaskTest, self).setUp()
self._encoder_config = encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1)
self._encoder_config = encoders.EncoderConfig(
bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1))
self._train_data_config = question_answering_dataloader.QADataConfig(
input_path="dummy",
seq_length=128,
global_batch_size=1)
val_data = {"version": "1.1",
"data": [{"paragraphs": [
{"context": "Sky is blue.",
"qas": [{"question": "What is blue?", "id": "1234",
"answers": [{"text": "Sky", "answer_start": 0},
{"text": "Sky", "answer_start": 0},
{"text": "Sky", "answer_start": 0}]
}]}]}]}
input_path="dummy", seq_length=128, global_batch_size=1)
val_data = {
"version":
"1.1",
"data": [{
"paragraphs": [{
"context":
"Sky is blue.",
"qas": [{
"question":
"What is blue?",
"id":
"1234",
"answers": [{
"text": "Sky",
"answer_start": 0
}, {
"text": "Sky",
"answer_start": 0
}, {
"text": "Sky",
"answer_start": 0
}]
}]
}]
}]
}
self._val_input_path = os.path.join(self.get_temp_dir(), "val_data.json")
with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
writer.write(json.dumps(val_data, indent=4) + "\n")
......@@ -81,23 +98,26 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
val_dataset = task.build_inputs(config.validation_data)
val_iterator = iter(val_dataset)
logs = task.validation_step(next(val_iterator), model, metrics=metrics)
# Mock that `logs` is from one replica.
logs = {x: (logs[x],) for x in logs}
logs = task.aggregate_logs(step_outputs=logs)
metrics = task.reduce_aggregated_logs(logs)
self.assertIn("final_f1", metrics)
@parameterized.parameters(itertools.product(
@parameterized.parameters(
itertools.product(
(False, True),
("WordPiece", "SentencePiece"),
))
def test_task(self, version_2_with_negative, tokenization):
# Saves a checkpoint.
pretrain_cfg = bert.BertPretrainerConfig(
pretrain_cfg = bert.PretrainerConfig(
encoder=self._encoder_config,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=3, name="next_sentence")
])
pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg)
ckpt = tf.train.Checkpoint(
model=pretrain_model, **pretrain_model.checkpoint_items)
saved_path = ckpt.save(self.get_temp_dir())
......@@ -160,6 +180,27 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
validation_data=self._get_validation_data_config())
self._run_task(config)
@parameterized.named_parameters(("squad1", False), ("squad2", True))
def test_predict(self, version_2_with_negative):
validation_data = self._get_validation_data_config(
version_2_with_negative=version_2_with_negative)
config = question_answering.QuestionAnsweringConfig(
model=question_answering.ModelConfig(encoder=self._encoder_config),
train_data=self._train_data_config,
validation_data=validation_data)
task = question_answering.QuestionAnsweringTask(config)
model = task.build_model()
all_predictions, all_nbest, scores_diff = question_answering.predict(
task, validation_data, model)
self.assertLen(all_predictions, 1)
self.assertLen(all_nbest, 1)
if version_2_with_negative:
self.assertLen(scores_diff, 1)
else:
self.assertEmpty(scores_diff)
if __name__ == "__main__":
tf.test.main()
......@@ -26,6 +26,7 @@ import tensorflow as tf
import tensorflow_hub as hub
from official.core import base_task
from official.core import task_factory
from official.modeling.hyperparams import base_config
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import encoders
......@@ -43,8 +44,7 @@ class ModelConfig(base_config.Config):
"""A classifier/regressor configuration."""
num_classes: int = 0
use_encoder_pooler: bool = False
encoder: encoders.TransformerEncoderConfig = (
encoders.TransformerEncoderConfig())
encoder: encoders.EncoderConfig = encoders.EncoderConfig()
@dataclasses.dataclass
......@@ -62,7 +62,7 @@ class SentencePredictionConfig(cfg.TaskConfig):
validation_data: cfg.DataConfig = cfg.DataConfig()
@base_task.register_task_cls(SentencePredictionConfig)
@task_factory.register_task_cls(SentencePredictionConfig)
class SentencePredictionTask(base_task.Task):
"""Task object for sentence_prediction."""
......@@ -84,15 +84,14 @@ class SentencePredictionTask(base_task.Task):
if self._hub_module:
encoder_network = utils.get_encoder_from_hub(self._hub_module)
else:
encoder_network = encoders.instantiate_encoder_from_cfg(
self.task_config.model.encoder)
encoder_network = encoders.build_encoder(self.task_config.model.encoder)
encoder_cfg = self.task_config.model.encoder.get()
# Currently, we only support bert-style sentence prediction finetuning.
return models.BertClassifier(
network=encoder_network,
num_classes=self.task_config.model.num_classes,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=self.task_config.model.encoder.initializer_range),
stddev=encoder_cfg.initializer_range),
use_encoder_pooler=self.task_config.model.use_encoder_pooler)
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
......@@ -244,11 +243,7 @@ def predict(task: SentencePredictionTask, params: cfg.DataConfig,
"""
is_regression = task.task_config.model.num_classes == 1
@tf.function
def predict_step(iterator):
"""Predicts on distributed devices."""
def _replicated_step(inputs):
def predict_step(inputs):
"""Replicated prediction calculation."""
x, _ = inputs
outputs = task.inference_step(x, model)
......@@ -257,21 +252,16 @@ def predict(task: SentencePredictionTask, params: cfg.DataConfig,
else:
return tf.argmax(outputs, axis=-1)
outputs = tf.distribute.get_strategy().run(
_replicated_step, args=(next(iterator),))
return tf.nest.map_structure(
tf.distribute.get_strategy().experimental_local_results, outputs)
def reduce_fn(state, outputs):
def aggregate_fn(state, outputs):
"""Concatenates model's outputs."""
if state is None:
state = {'predictions': []}
for per_replica_batch_predictions in outputs:
state.extend(per_replica_batch_predictions)
state['predictions'].extend(per_replica_batch_predictions)
return state
loop_fn = orbit.utils.create_loop_fn(predict_step)
dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
task.build_inputs, params)
# Set `num_steps` to -1 to exhaust the dataset.
predictions = loop_fn(
iter(dataset), num_steps=-1, state=[], reduce_fn=reduce_fn)
return predictions
outputs = utils.predict(predict_step, aggregate_fn, dataset)
return outputs['predictions']
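# A minimal usage sketch (illustrative only; `config` is a hypothetical
# SentencePredictionConfig). The result is a flat list with one entry per
# example: a regression score when `num_classes == 1`, else an argmax class id.
#
#   task = SentencePredictionTask(config)
#   model = task.build_model()
#   predictions = predict(task, config.validation_data, model)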
......@@ -26,6 +26,7 @@ from official.nlp.bert import export_tfhub
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.tasks import masked_lm
from official.nlp.tasks import sentence_prediction
......@@ -68,8 +69,8 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
def get_model_config(self, num_classes):
return sentence_prediction.ModelConfig(
encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
encoder=encoders.EncoderConfig(
bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
num_classes=num_classes)
def _run_task(self, config):
......@@ -102,14 +103,14 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
task.validation_step(next(iterator), model, metrics=metrics)
# Saves a checkpoint.
pretrain_cfg = bert.BertPretrainerConfig(
encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
pretrain_cfg = bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=3, name="next_sentence")
])
pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
pretrain_model = masked_lm.MaskedLMTask(None).build_model(pretrain_cfg)
ckpt = tf.train.Checkpoint(
model=pretrain_model, **pretrain_model.checkpoint_items)
ckpt.save(config.init_checkpoint)
......@@ -136,8 +137,8 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
if num_classes == 1:
self.assertIsInstance(metrics[0], tf.keras.metrics.MeanSquaredError)
else:
self.assertIsInstance(
metrics[0], tf.keras.metrics.SparseCategoricalAccuracy)
self.assertIsInstance(metrics[0],
tf.keras.metrics.SparseCategoricalAccuracy)
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
......@@ -147,9 +148,9 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
logs = task.validation_step(next(iterator), model, metrics=metrics)
loss = logs["loss"].numpy()
if num_classes == 1:
self.assertAlmostEqual(loss, 42.77483, places=3)
self.assertGreater(loss, 1.0)
else:
self.assertAlmostEqual(loss, 3.57627e-6, places=3)
self.assertLess(loss, 1.0)
@parameterized.parameters(("matthews_corrcoef", 2),
("pearson_spearman_corr", 1))
......
......@@ -14,7 +14,6 @@
# limitations under the License.
# ==============================================================================
"""Tagging (e.g., NER/POS) task."""
import logging
from typing import List, Optional, Tuple
import dataclasses
......@@ -26,6 +25,7 @@ import tensorflow as tf
import tensorflow_hub as hub
from official.core import base_task
from official.core import task_factory
from official.modeling.hyperparams import base_config
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import encoders
......@@ -37,8 +37,7 @@ from official.nlp.tasks import utils
@dataclasses.dataclass
class ModelConfig(base_config.Config):
"""A base span labeler configuration."""
encoder: encoders.TransformerEncoderConfig = (
encoders.TransformerEncoderConfig())
encoder: encoders.EncoderConfig = encoders.EncoderConfig()
head_dropout: float = 0.1
head_initializer_range: float = 0.02
......@@ -81,7 +80,7 @@ def _masked_labels_and_weights(y_true):
return masked_y_true, tf.cast(mask, tf.float32)
@base_task.register_task_cls(TaggingConfig)
@task_factory.register_task_cls(TaggingConfig)
class TaggingTask(base_task.Task):
"""Task object for tagging (e.g., NER or POS)."""
......@@ -102,8 +101,7 @@ class TaggingTask(base_task.Task):
if self._hub_module:
encoder_network = utils.get_encoder_from_hub(self._hub_module)
else:
encoder_network = encoders.instantiate_encoder_from_cfg(
self.task_config.model.encoder)
encoder_network = encoders.build_encoder(self.task_config.model.encoder)
return models.BertTokenClassifier(
network=encoder_network,
......@@ -215,20 +213,6 @@ class TaggingTask(base_task.Task):
seqeval_metrics.accuracy_score(label_class, predict_class),
}
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
ckpt_dir_or_file = self.task_config.init_checkpoint
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.restore(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('Finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
def predict(task: TaggingTask, params: cfg.DataConfig,
model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
......@@ -246,11 +230,7 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
sentence id of the corresponding example.
"""
@tf.function
def predict_step(iterator):
"""Predicts on distributed devices."""
def _replicated_step(inputs):
def predict_step(inputs):
"""Replicated prediction calculation."""
x, y = inputs
sentence_ids = x.pop('sentence_id')
......@@ -262,14 +242,13 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
label_mask=label_mask,
sentence_ids=sentence_ids)
outputs = tf.distribute.get_strategy().run(
_replicated_step, args=(next(iterator),))
return tf.nest.map_structure(
tf.distribute.get_strategy().experimental_local_results, outputs)
def reduce_fn(state, outputs):
def aggregate_fn(state, outputs):
"""Concatenates model's outputs."""
cur_predict_ids, cur_sentence_ids = state
if state is None:
state = {'predict_ids': [], 'sentence_ids': []}
cur_predict_ids = state['predict_ids']
cur_sentence_ids = state['sentence_ids']
for batch_predict_ids, batch_label_mask, batch_sentence_ids in zip(
outputs['predict_ids'], outputs['label_mask'],
outputs['sentence_ids']):
......@@ -283,12 +262,9 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
# Skip the padding label.
if tmp_label_mask[i]:
cur_predict_ids[-1].append(tmp_predict_ids[i])
return cur_predict_ids, cur_sentence_ids
return state
loop_fn = orbit.utils.create_loop_fn(predict_step)
dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
task.build_inputs, params)
# Set `num_steps` to -1 to exhaust the dataset.
predict_ids, sentence_ids = loop_fn(
iter(dataset), num_steps=-1, state=([], []), reduce_fn=reduce_fn)
return predict_ids, sentence_ids
outputs = utils.predict(predict_step, aggregate_fn, dataset)
return outputs['predict_ids'], outputs['sentence_ids']
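# A minimal usage sketch (illustrative only; `config` is a hypothetical
# TaggingConfig): `predict_ids` holds the non-padded tag ids per example and
# `sentence_ids` maps each entry back to its source sentence.
#
#   task = TaggingTask(config)
#   model = task.build_model()
#   predict_ids, sentence_ids = predict(task, config.validation_data, model)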
......@@ -53,8 +53,8 @@ class TaggingTest(tf.test.TestCase):
def setUp(self):
super(TaggingTest, self).setUp()
self._encoder_config = encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1)
self._encoder_config = encoders.EncoderConfig(
bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1))
self._train_data_config = tagging_data_loader.TaggingDataConfig(
input_path="dummy", seq_length=128, global_batch_size=1)
......@@ -74,7 +74,7 @@ class TaggingTest(tf.test.TestCase):
def test_task(self):
# Saves a checkpoint.
encoder = encoders.instantiate_encoder_from_cfg(self._encoder_config)
encoder = encoders.build_encoder(self._encoder_config)
ckpt = tf.train.Checkpoint(encoder=encoder)
saved_path = ckpt.save(self.get_temp_dir())
......
......@@ -14,6 +14,9 @@
# limitations under the License.
# ==============================================================================
"""Common utils for tasks."""
from typing import Any, Callable
import orbit
import tensorflow as tf
import tensorflow_hub as hub
......@@ -32,3 +35,34 @@ def get_encoder_from_hub(hub_module: str) -> tf.keras.Model:
return tf.keras.Model(
inputs=[input_word_ids, input_mask, input_type_ids],
outputs=[sequence_output, pooled_output])
def predict(predict_step_fn: Callable[[Any], Any],
aggregate_fn: Callable[[Any, Any], Any],
dataset: tf.data.Dataset):
"""Runs prediction.
Args:
predict_step_fn: A callable such as `def predict_step(inputs)`, where
`inputs` are input tensors.
aggregate_fn: A callable such as `def aggregate_fn(state, value)`, where
`value` is the output of `predict_step_fn` and `state` is `None` on the
first call.
dataset: A `tf.data.Dataset` object.
Returns:
The aggregated predictions.
"""
@tf.function
def predict_step(iterator):
"""Predicts on distributed devices."""
outputs = tf.distribute.get_strategy().run(
predict_step_fn, args=(next(iterator),))
return tf.nest.map_structure(
tf.distribute.get_strategy().experimental_local_results, outputs)
loop_fn = orbit.utils.create_loop_fn(predict_step)
# Set `num_steps` to -1 to exhaust the dataset.
outputs = loop_fn(
iter(dataset), num_steps=-1, state=None, reduce_fn=aggregate_fn) # pytype: disable=wrong-arg-types
return outputs
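# A sketch of the calling contract (illustrative only): `loop_fn` starts with
# `state=None`, so `aggregate_fn` must create the initial state on its first
# call; `value` holds a tuple of per-replica results when `predict_step_fn`
# returns a single tensor.
#
#   def aggregate_fn(state, value):
#     state = state if state is not None else {'outputs': []}
#     for per_replica_output in value:
#       state['outputs'].extend(per_replica_output.numpy())
#     return state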
......@@ -26,7 +26,7 @@ import re
import sys
import unicodedata
from absl import app as absl_app
from absl import app
from absl import flags
import six
from six.moves import range
......@@ -149,4 +149,4 @@ if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
define_compute_bleu_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
app.run(main)
......@@ -23,7 +23,7 @@ import random
import tarfile
# pylint: disable=g-bad-import-order
from absl import app as absl_app
from absl import app
from absl import flags
from absl import logging
import six
......@@ -436,4 +436,4 @@ if __name__ == "__main__":
logging.set_verbosity(logging.INFO)
define_data_download_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
app.run(main)
......@@ -14,32 +14,14 @@
# ==============================================================================
"""Keras layers of XLNet model in TF 2.0."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import copy
import numpy as np
import tensorflow as tf
from official.nlp.xlnet import data_utils
def gelu(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the RELU.
Original paper: https://arxiv.org/abs/1606.08415
Args:
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
return tf.keras.activations.gelu(x, approximate=True)
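# Sanity-check sketch (not part of this module): the tanh approximation removed
# above and `tf.keras.activations.gelu(x, approximate=True)` agree numerically.
#
#   x = tf.constant([-1.0, 0.0, 1.0])
#   manual = x * 0.5 * (1.0 + tf.tanh(
#       np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
#   np.testing.assert_allclose(manual.numpy(), gelu(x).numpy(), rtol=1e-6)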
def rel_shift(x, klen=-1):
......
......@@ -20,7 +20,7 @@ import sys
from setuptools import find_packages
from setuptools import setup
version = '2.2.0'
version = '2.3.0'
project_name = 'tf-models-official'
......@@ -60,7 +60,7 @@ if project_name == 'tf-models-nightly':
version += '.dev' + datetime.datetime.now().strftime('%Y%m%d')
install_requires.append('tf-nightly')
else:
install_requires.append('tensorflow>=2.2.0')
install_requires.append('tensorflow>=2.3.0')
print('install_requires: ', install_requires)
print('dependency_links: ', dependency_links)
......
......@@ -32,7 +32,8 @@ from official.recommendation import movielens
def create_dataset_from_tf_record_files(input_file_pattern,
pre_batch_size,
batch_size,
is_training=True):
is_training=True,
rebatch=False):
"""Creates dataset from (tf)records files for training/evaluation."""
if pre_batch_size != batch_size:
raise ValueError("Pre-batch ({}) size is not equal to batch "
......@@ -51,6 +52,12 @@ def create_dataset_from_tf_record_files(input_file_pattern,
dataset = dataset.map(
decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
if rebatch:
# A workaround for TPU Pod evaluation dataset.
# TODO (b/162341937) remove once it's fixed.
dataset = dataset.unbatch()
dataset = dataset.batch(pre_batch_size)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
return dataset
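# Mechanics of the rebatch workaround (illustrative sketch, not library code):
# `unbatch` flattens the pre-batched records and `batch` regroups them so every
# emitted batch has exactly `pre_batch_size` elements again.
#
#   ds = tf.data.Dataset.range(6).batch(2)   # yields [0,1], [2,3], [4,5]
#   ds = ds.unbatch().batch(3)               # yields [0,1,2], [3,4,5]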
......@@ -151,12 +158,18 @@ def create_ncf_input_data(params,
params["train_dataset_path"],
input_meta_data["train_prebatch_size"],
params["batch_size"],
is_training=True)
is_training=True,
rebatch=False)
# Re-batch evaluation dataset for TPU Pods.
# TODO (b/162341937) remove once it's fixed.
eval_rebatch = (params["use_tpu"] and strategy.num_replicas_in_sync > 8)
eval_dataset = create_dataset_from_tf_record_files(
params["eval_dataset_path"],
input_meta_data["eval_prebatch_size"],
params["eval_batch_size"],
is_training=False)
is_training=False,
rebatch=eval_rebatch)
num_train_steps = int(input_meta_data["num_train_steps"])
num_eval_steps = int(input_meta_data["num_eval_steps"])
......
......@@ -235,6 +235,7 @@ def run_ncf(_):
params = ncf_common.parse_flags(FLAGS)
params["distribute_strategy"] = strategy
params["use_tpu"] = (FLAGS.distribution_strategy == "tpu")
if params["use_tpu"] and not params["keras_use_ctl"]:
logging.error("Custom training loop must be used when using TPUStrategy.")
......@@ -491,7 +492,8 @@ def run_ncf_custom_training(params,
logging.info("Done training epoch %s, epoch loss=%.3f", epoch + 1,
train_loss)
eval_input_iterator = iter(eval_input_dataset)
eval_input_iterator = iter(
strategy.experimental_distribute_dataset(eval_input_dataset))
hr_sum = 0.0
hr_count = 0.0
......
......@@ -3,7 +3,7 @@ google-api-python-client>=1.6.7
google-cloud-bigquery>=0.31.0
kaggle>=1.3.9
numpy>=1.15.4
oauth2client>=4.1.2
oauth2client
pandas>=0.22.0
psutil>=5.4.3
py-cpuinfo>=3.3.0
......@@ -21,7 +21,7 @@ pyyaml
# CV related dependencies
opencv-python-headless
Pillow
-e git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI
pycocotools
# NLP related dependencies
seqeval
sentencepiece
......@@ -26,6 +26,12 @@ from absl import logging
import tensorflow as tf
from tensorflow.python.eager import monitoring
global_batch_size_gauge = monitoring.IntGauge(
'/tensorflow/training/global_batch_size', 'TF training global batch size')
class BatchTimestamp(object):
"""A structure to store batch time stamp."""
......@@ -60,6 +66,8 @@ class TimeHistory(tf.keras.callbacks.Callback):
self.steps_in_epoch = 0
self.start_time = None
global_batch_size_gauge.get_cell().set(batch_size)
if logdir:
self.summary_writer = tf.summary.create_file_writer(logdir)
else:
......
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Mock task for testing."""
import dataclasses
import numpy as np
import tensorflow as tf
from official.core import base_task
from official.core import exp_factory
from official.core import task_factory
from official.modeling.hyperparams import config_definitions as cfg
class MockModel(tf.keras.Model):
def __init__(self, network):
super().__init__()
self.network = network
def call(self, inputs):
outputs = self.network(inputs)
self.add_loss(tf.reduce_mean(outputs))
return outputs
@dataclasses.dataclass
class MockTaskConfig(cfg.TaskConfig):
pass
@task_factory.register_task_cls(MockTaskConfig)
class MockTask(base_task.Task):
"""Mock task object for testing."""
def __init__(self, params=None, logging_dir=None):
super().__init__(params=params, logging_dir=logging_dir)
def build_model(self, *arg, **kwargs):
inputs = tf.keras.layers.Input(shape=(2,), name="random", dtype=tf.float32)
outputs = tf.keras.layers.Dense(1)(inputs)
network = tf.keras.Model(inputs=inputs, outputs=outputs)
return MockModel(network)
def build_metrics(self, training: bool = True):
del training
return [tf.keras.metrics.Accuracy(name="acc")]
def build_inputs(self, params):
def generate_data(_):
x = tf.zeros(shape=(2,), dtype=tf.float32)
label = tf.zeros([1], dtype=tf.int32)
return x, label
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(
generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True)
def aggregate_logs(self, state, step_outputs):
if state is None:
state = {}
for key, value in step_outputs.items():
if key not in state:
state[key] = []
state[key].append(
np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value]))
return state
def reduce_aggregated_logs(self, aggregated_logs):
for k, v in aggregated_logs.items():
aggregated_logs[k] = np.sum(np.stack(v, axis=0))
return aggregated_logs
@exp_factory.register_config_factory("mock")
def mock_experiment() -> cfg.ExperimentConfig:
config = cfg.ExperimentConfig(
task=MockTaskConfig(), trainer=cfg.TrainerConfig())
return config
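# A minimal usage sketch for the mock task (everything referenced is defined
# above in this file):
#
#   task = MockTask(params=MockTaskConfig())
#   model = task.build_model()
#   features, labels = next(iter(task.build_inputs(None)))
#   outputs = model(features)   # shape (2, 1); also registers a mean loss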
......@@ -52,7 +52,6 @@ MASKRCNN_CFG.override({
'anchor_size': 8,
},
'rpn_head': {
'anchors_per_location': 3,
'num_convs': 2,
'num_filters': 256,
'use_separable_conv': False,
......
......@@ -39,7 +39,6 @@ RETINANET_CFG.override({
'max_num_instances': 100,
},
'retinanet_head': {
'anchors_per_location': 9,
'num_convs': 4,
'num_filters': 256,
'use_separable_conv': False,
......
......@@ -62,7 +62,6 @@ SHAPEMASK_CFG.override({
'upsample_factor': 4,
},
'retinanet_head': {
'anchors_per_location': 9,
'num_convs': 4,
'num_filters': 256,
'use_separable_conv': False,
......
......@@ -46,15 +46,15 @@ class Anchor(object):
num_scales: integer number representing intermediate scales added
on each level. For instances, num_scales=2 adds one additional
intermediate anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: list of float numbers representing the aspect raito anchors
aspect_ratios: list of float numbers representing the aspect ratio anchors
added on each level. The number indicates the ratio of width to height.
For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
scale level.
anchor_size: float number representing the scale of size of the base
anchor to the feature stride 2^level.
image_size: a list of integer numbers or Tensors representing
[height, width] of the input image size.The image_size should be divided
by the largest feature stride 2^max_level.
[height, width] of the input image size. The image_size should be
divisible by the largest feature stride 2^max_level.
"""
self.min_level = min_level
self.max_level = max_level
......@@ -77,8 +77,8 @@ class Anchor(object):
for scale in range(self.num_scales):
for aspect_ratio in self.aspect_ratios:
stride = 2 ** level
intermidate_scale = 2 ** (scale / float(self.num_scales))
base_anchor_size = self.anchor_size * stride * intermidate_scale
intermediate_scale = 2 ** (scale / float(self.num_scales))
base_anchor_size = self.anchor_size * stride * intermediate_scale
aspect_x = aspect_ratio ** 0.5
aspect_y = aspect_ratio ** -0.5
half_anchor_size_x = base_anchor_size * aspect_x / 2.0
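# Worked example (numbers only, illustrative; the y half-size computation is
# assumed symmetric to the x one shown above): with level=3, num_scales=2,
# anchor_size=4 and aspect_ratio=2.0:
#   stride = 2**3 = 8
#   intermediate scales: 2**0 = 1.0 and 2**0.5 ~= 1.414
#   base_anchor_size = 4 * 8 * 1.0 = 32 (and ~45.25 at the second scale)
#   half_anchor_size_x = 32 * sqrt(2) / 2 ~= 22.63
#   half_anchor_size_y = 32 / sqrt(2) / 2 ~= 11.31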
......