resolve merge conflicts

31ca3b97 · Kaushik Shivakumar · 3e9d886d · 7fcd7cba · 31ca3b97 · 31ca3b97
Commit 31ca3b97 authored Jul 23, 2020 by Kaushik Shivakumar
20 changed files
--- a/official/nlp/tasks/electra_task_test.py
+++ b/official/nlp/tasks/electra_task_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for official.nlp.tasks.electra_task."""
+
+import tensorflow as tf
+
+from official.nlp.configs import bert
+from official.nlp.configs import electra
+from official.nlp.configs import encoders
+from official.nlp.data import pretrain_dataloader
+from official.nlp.tasks import electra_task
+
+
+class ELECTRAPretrainTaskTest(tf.test.TestCase):
+
+  def test_task(self):
+    config = electra_task.ELECTRAPretrainConfig(
+        model=electra.ELECTRAPretrainerConfig(
+            generator_encoder=encoders.TransformerEncoderConfig(
+                vocab_size=30522, num_layers=1),
+            discriminator_encoder=encoders.TransformerEncoderConfig(
+                vocab_size=30522, num_layers=1),
+            num_masked_tokens=20,
+            sequence_length=128,
+            cls_heads=[
+                bert.ClsHeadConfig(
+                    inner_dim=10, num_classes=2, name="next_sentence")
+            ]),
+        train_data=pretrain_dataloader.BertPretrainDataConfig(
+            input_path="dummy",
+            max_predictions_per_seq=20,
+            seq_length=128,
+            global_batch_size=1))
+    task = electra_task.ELECTRAPretrainTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    dataset = task.build_inputs(config.train_data)
+
+    iterator = iter(dataset)
+    optimizer = tf.keras.optimizers.SGD(lr=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    task.validation_step(next(iterator), model, metrics=metrics)
+
+
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/nlp/tasks/masked_lm.py
+++ b/official/nlp/tasks/masked_lm.py
@@ -14,19 +14,20 @@
 # limitations under the License.
 # ==============================================================================
 """Masked language task."""
+from absl import logging
 import dataclasses
 import tensorflow as tf

 from official.core import base_task
 from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import bert
-from official.nlp.data import pretrain_dataloader
-from official.nlp.modeling import losses as loss_lib
+from official.nlp.data import data_loader_factory


 @dataclasses.dataclass
 class MaskedLMConfig(cfg.TaskConfig):
  """The model config."""
+  init_checkpoint: str = ''
  model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(cls_heads=[
      bert.ClsHeadConfig(
          inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence')
@@ -39,8 +40,9 @@ class MaskedLMConfig(cfg.TaskConfig):
 class MaskedLMTask(base_task.Task):
  """Mock task object for testing."""

-  def build_model(self):
-    return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)
+  def build_model(self, params=None):
+    params = params or self.task_config.model
+    return bert.instantiate_pretrainer_from_cfg(params)

  def build_losses(self,
                   labels,
@@ -61,9 +63,10 @@ class MaskedLMTask(base_task.Task):
      sentence_labels = labels['next_sentence_labels']
      sentence_outputs = tf.cast(
          model_outputs['next_sentence'], dtype=tf.float32)
-      sentence_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
-          labels=sentence_labels,
-          predictions=tf.nn.log_softmax(sentence_outputs, axis=-1))
+      sentence_loss = tf.reduce_mean(
+          tf.keras.losses.sparse_categorical_crossentropy(sentence_labels,
+                                                          sentence_outputs,
+                                                          from_logits=True))
      metrics['next_sentence_loss'].update_state(sentence_loss)
      total_loss = mlm_loss + sentence_loss
    else:
@@ -95,8 +98,7 @@ class MaskedLMTask(base_task.Task):
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

-    return pretrain_dataloader.BertPretrainDataLoader(params).load(
-        input_context)
+    return data_loader_factory.get_data_loader(params).load(input_context)

  def build_metrics(self, training=None):
    del training
@@ -172,3 +174,17 @@ class MaskedLMTask(base_task.Task):
        aux_losses=model.losses)
    self.process_metrics(metrics, inputs, outputs)
    return {self.loss: loss}
+
+  def initialize(self, model: tf.keras.Model):
+    ckpt_dir_or_file = self.task_config.init_checkpoint
+    if tf.io.gfile.isdir(ckpt_dir_or_file):
+      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
+    if not ckpt_dir_or_file:
+      return
+    # Restoring all modules defined by the model, e.g. encoder, masked_lm and
+    # cls pooler. The best initialization may vary case by case.
+    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
+    status = ckpt.read(ckpt_dir_or_file)
+    status.expect_partial().assert_existing_objects_matched()
+    logging.info('Finished loading pretrained checkpoint from %s',
+                 ckpt_dir_or_file)
--- a/official/nlp/tasks/masked_lm_test.py
+++ b/official/nlp/tasks/masked_lm_test.py
@@ -19,6 +19,7 @@ import tensorflow as tf

 from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import pretrain_dataloader
 from official.nlp.tasks import masked_lm


@@ -26,14 +27,14 @@ class MLMTaskTest(tf.test.TestCase):

  def test_task(self):
    config = masked_lm.MaskedLMConfig(
+        init_checkpoint=self.get_temp_dir(),
        model=bert.BertPretrainerConfig(
            encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
-            num_masked_tokens=20,
            cls_heads=[
                bert.ClsHeadConfig(
                    inner_dim=10, num_classes=2, name="next_sentence")
            ]),
-        train_data=bert.BertPretrainDataConfig(
+        train_data=pretrain_dataloader.BertPretrainDataConfig(
            input_path="dummy",
            max_predictions_per_seq=20,
            seq_length=128,
@@ -48,6 +49,12 @@ class MLMTaskTest(tf.test.TestCase):
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)

+    # Saves a checkpoint.
+    ckpt = tf.train.Checkpoint(
+        model=model, **model.checkpoint_items)
+    ckpt.save(config.init_checkpoint)
+    task.initialize(model)
+

 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/tasks/question_answering.py
+++ b/official/nlp/tasks/question_answering.py
@@ -14,40 +14,55 @@
 # limitations under the License.
 # ==============================================================================
 """Question answering task."""
-import logging
+import collections
+import json
+import os
+from absl import logging
 import dataclasses
 import tensorflow as tf
 import tensorflow_hub as hub

 from official.core import base_task
+from official.modeling.hyperparams import base_config
 from official.modeling.hyperparams import config_definitions as cfg
-from official.nlp.bert import input_pipeline
+from official.nlp.bert import squad_evaluate_v1_1
+from official.nlp.bert import squad_evaluate_v2_0
+from official.nlp.bert import tokenization
 from official.nlp.configs import encoders
+from official.nlp.data import data_loader_factory
+from official.nlp.data import squad_lib as squad_lib_wp
+from official.nlp.data import squad_lib_sp
 from official.nlp.modeling import models
 from official.nlp.tasks import utils


+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """A base span labeler configuration."""
+  encoder: encoders.TransformerEncoderConfig = (
+      encoders.TransformerEncoderConfig())
+
+
 @dataclasses.dataclass
 class QuestionAnsweringConfig(cfg.TaskConfig):
  """The model config."""
  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
  init_checkpoint: str = ''
  hub_module_url: str = ''
-  model: encoders.TransformerEncoderConfig = (
-      encoders.TransformerEncoderConfig())
+  n_best_size: int = 20
+  max_answer_length: int = 30
+  null_score_diff_threshold: float = 0.0
+  model: ModelConfig = ModelConfig()
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()


 @base_task.register_task_cls(QuestionAnsweringConfig)
 class QuestionAnsweringTask(base_task.Task):
-  """Task object for question answering.
-
-  TODO(lehou): Add post-processing.
-  """
+  """Task object for question answering."""

-  def __init__(self, params=cfg.TaskConfig):
-    super(QuestionAnsweringTask, self).__init__(params)
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
+    super(QuestionAnsweringTask, self).__init__(params, logging_dir)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
                       '`init_checkpoint` can be specified.')
@@ -56,17 +71,29 @@ class QuestionAnsweringTask(base_task.Task):
    else:
      self._hub_module = None

+    if params.validation_data.tokenization == 'WordPiece':
+      self.squad_lib = squad_lib_wp
+    elif params.validation_data.tokenization == 'SentencePiece':
+      self.squad_lib = squad_lib_sp
+    else:
+      raise ValueError('Unsupported tokenization method: {}'.format(
+          params.validation_data.tokenization))
+
+    if params.validation_data.input_path:
+      self._tf_record_input_path, self._eval_examples, self._eval_features = (
+          self._preprocess_eval_data(params.validation_data))
+
  def build_model(self):
    if self._hub_module:
      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
      encoder_network = encoders.instantiate_encoder_from_cfg(
-          self.task_config.model)
-
+          self.task_config.model.encoder)
+    # Currently, we only support bert-style question answering finetuning.
    return models.BertSpanLabeler(
        network=encoder_network,
        initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=self.task_config.model.initializer_range))
+            stddev=self.task_config.model.encoder.initializer_range))

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    start_positions = labels['start_positions']
@@ -85,9 +112,57 @@ class QuestionAnsweringTask(base_task.Task):
    loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2
    return loss

+  def _preprocess_eval_data(self, params):
+    eval_examples = self.squad_lib.read_squad_examples(
+        input_file=params.input_path,
+        is_training=False,
+        version_2_with_negative=params.version_2_with_negative)
+
+    temp_file_path = params.input_preprocessed_data_path or self.logging_dir
+    if not temp_file_path:
+      raise ValueError('You must specify a temporary directory, either in '
+                       'params.input_preprocessed_data_path or logging_dir to '
+                       'store intermediate evaluation TFRecord data.')
+    eval_writer = self.squad_lib.FeatureWriter(
+        filename=os.path.join(temp_file_path, 'eval.tf_record'),
+        is_training=False)
+    eval_features = []
+
+    def _append_feature(feature, is_padding):
+      if not is_padding:
+        eval_features.append(feature)
+      eval_writer.process_feature(feature)
+
+    kwargs = dict(
+        examples=eval_examples,
+        tokenizer=tokenization.FullTokenizer(
+            vocab_file=params.vocab_file,
+            do_lower_case=params.do_lower_case),
+        max_seq_length=params.seq_length,
+        doc_stride=params.doc_stride,
+        max_query_length=params.query_length,
+        is_training=False,
+        output_fn=_append_feature,
+        batch_size=params.global_batch_size)
+    if params.tokenization == 'SentencePiece':
+      # squad_lib_sp requires one more argument 'do_lower_case'.
+      kwargs['do_lower_case'] = params.do_lower_case
+
+    eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
+    eval_writer.close()
+
+    logging.info('***** Evaluation input stats *****')
+    logging.info('  Num orig examples = %d', len(eval_examples))
+    logging.info('  Num split examples = %d', len(eval_features))
+    logging.info('  Batch size = %d', params.global_batch_size)
+    logging.info('  Dataset size = %d', eval_dataset_size)
+
+    return eval_writer.filename, eval_examples, eval_features
+
  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
    if params.input_path == 'dummy':
+      # Dummy training data for unit test.
      def dummy_data(_):
        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
        x = dict(
@@ -105,16 +180,14 @@ class QuestionAnsweringTask(base_task.Task):
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

-    batch_size = input_context.get_per_replica_batch_size(
-        params.global_batch_size) if input_context else params.global_batch_size
-    # TODO(chendouble): add and use nlp.data.question_answering_dataloader.
-    dataset = input_pipeline.create_squad_dataset(
-        params.input_path,
-        params.seq_length,
-        batch_size,
-        is_training=params.is_training,
-        input_pipeline_context=input_context)
-    return dataset
+    if params.is_training:
+      dataloader_params = params
+    else:
+      input_path = self._tf_record_input_path
+      dataloader_params = params.replace(input_path=input_path)
+
+    return data_loader_factory.get_data_loader(
+        dataloader_params).load(input_context)

  def build_metrics(self, training=None):
    del training
@@ -141,6 +214,70 @@ class QuestionAnsweringTask(base_task.Task):
        y_true=labels,  # labels has keys 'start_positions' and 'end_positions'.
        y_pred={'start_positions': start_logits, 'end_positions': end_logits})

+  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
+    features, _ = inputs
+    unique_ids = features.pop('unique_ids')
+    model_outputs = self.inference_step(features, model)
+    start_logits, end_logits = model_outputs
+    logs = {
+        self.loss: 0.0,  # TODO(lehou): compute the real validation loss.
+        'unique_ids': unique_ids,
+        'start_logits': start_logits,
+        'end_logits': end_logits,
+    }
+    return logs
+
+  raw_aggregated_result = collections.namedtuple(
+      'RawResult', ['unique_id', 'start_logits', 'end_logits'])
+
+  def aggregate_logs(self, state=None, step_outputs=None):
+    assert step_outputs is not None, 'Got no logs from self.validation_step.'
+    if state is None:
+      state = []
+
+    for unique_ids, start_logits, end_logits in zip(
+        step_outputs['unique_ids'],
+        step_outputs['start_logits'],
+        step_outputs['end_logits']):
+      u_ids, s_logits, e_logits = (
+          unique_ids.numpy(), start_logits.numpy(), end_logits.numpy())
+      if u_ids.size == 1:
+        u_ids = [u_ids]
+        s_logits = [s_logits]
+        e_logits = [e_logits]
+      for values in zip(u_ids, s_logits, e_logits):
+        state.append(self.raw_aggregated_result(
+            unique_id=values[0],
+            start_logits=values[1].tolist(),
+            end_logits=values[2].tolist()))
+    return state
+
+  def reduce_aggregated_logs(self, aggregated_logs):
+    all_predictions, _, scores_diff = (
+        self.squad_lib.postprocess_output(
+            self._eval_examples,
+            self._eval_features,
+            aggregated_logs,
+            self.task_config.n_best_size,
+            self.task_config.max_answer_length,
+            self.task_config.validation_data.do_lower_case,
+            version_2_with_negative=(
+                self.task_config.validation_data.version_2_with_negative),
+            null_score_diff_threshold=(
+                self.task_config.null_score_diff_threshold),
+            verbose=False))
+
+    with tf.io.gfile.GFile(
+        self.task_config.validation_data.input_path, 'r') as reader:
+      dataset_json = json.load(reader)
+      pred_dataset = dataset_json['data']
+    if self.task_config.validation_data.version_2_with_negative:
+      eval_metrics = squad_evaluate_v2_0.evaluate(
+          pred_dataset, all_predictions, scores_diff)
+    else:
+      eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
+    return eval_metrics
+
  def initialize(self, model):
    """Load a pretrained checkpoint (if exists) and then train from iter 0."""
    ckpt_dir_or_file = self.task_config.init_checkpoint
@@ -150,7 +287,7 @@ class QuestionAnsweringTask(base_task.Task):
      return

    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
-    status = ckpt.restore(ckpt_dir_or_file)
+    status = ckpt.read(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
-    logging.info('finished loading pretrained checkpoint from %s',
+    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
--- a/official/nlp/tasks/question_answering_test.py
+++ b/official/nlp/tasks/question_answering_test.py
@@ -14,73 +14,107 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for official.nlp.tasks.question_answering."""
-import functools
+import itertools
+import json
 import os
+from absl.testing import parameterized
 import tensorflow as tf

 from official.nlp.bert import configs
 from official.nlp.bert import export_tfhub
 from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import question_answering_dataloader
 from official.nlp.tasks import question_answering


-class QuestionAnsweringTaskTest(tf.test.TestCase):
+class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(QuestionAnsweringTaskTest, self).setUp()
    self._encoder_config = encoders.TransformerEncoderConfig(
        vocab_size=30522, num_layers=1)
-    self._train_data_config = bert.QADataConfig(
-        input_path="dummy", seq_length=128, global_batch_size=1)
+    self._train_data_config = question_answering_dataloader.QADataConfig(
+        input_path="dummy",
+        seq_length=128,
+        global_batch_size=1)
+
+    val_data = {"version": "1.1",
+                "data": [{"paragraphs": [
+                    {"context": "Sky is blue.",
+                     "qas": [{"question": "What is blue?", "id": "1234",
+                              "answers": [{"text": "Sky", "answer_start": 0},
+                                          {"text": "Sky", "answer_start": 0},
+                                          {"text": "Sky", "answer_start": 0}]
+                              }]}]}]}
+    self._val_input_path = os.path.join(self.get_temp_dir(), "val_data.json")
+    with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
+      writer.write(json.dumps(val_data, indent=4) + "\n")
+
+    self._test_vocab = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with tf.io.gfile.GFile(self._test_vocab, "w") as writer:
+      writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")
+
+  def _get_validation_data_config(self, version_2_with_negative=False):
+    return question_answering_dataloader.QADataConfig(
+        is_training=False,
+        input_path=self._val_input_path,
+        input_preprocessed_data_path=self.get_temp_dir(),
+        seq_length=128,
+        global_batch_size=1,
+        version_2_with_negative=version_2_with_negative,
+        vocab_file=self._test_vocab,
+        tokenization="WordPiece",
+        do_lower_case=True)

  def _run_task(self, config):
    task = question_answering.QuestionAnsweringTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
+    task.initialize(model)

-    strategy = tf.distribute.get_strategy()
-    dataset = strategy.experimental_distribute_datasets_from_function(
-        functools.partial(task.build_inputs, config.train_data))
-
-    iterator = iter(dataset)
+    train_dataset = task.build_inputs(config.train_data)
+    train_iterator = iter(train_dataset)
    optimizer = tf.keras.optimizers.SGD(lr=0.1)
-    task.train_step(next(iterator), model, optimizer, metrics=metrics)
-    task.validation_step(next(iterator), model, metrics=metrics)
-
-  def test_task(self):
+    task.train_step(next(train_iterator), model, optimizer, metrics=metrics)
+
+    val_dataset = task.build_inputs(config.validation_data)
+    val_iterator = iter(val_dataset)
+    logs = task.validation_step(next(val_iterator), model, metrics=metrics)
+    logs = task.aggregate_logs(step_outputs=logs)
+    metrics = task.reduce_aggregated_logs(logs)
+    self.assertIn("final_f1", metrics)
+
+  @parameterized.parameters(itertools.product(
+      (False, True),
+      ("WordPiece", "SentencePiece"),
+  ))
+  def test_task(self, version_2_with_negative, tokenization):
    # Saves a checkpoint.
    pretrain_cfg = bert.BertPretrainerConfig(
        encoder=self._encoder_config,
-        num_masked_tokens=20,
        cls_heads=[
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=3, name="next_sentence")
        ])
-    pretrain_model = bert.instantiate_bertpretrainer_from_cfg(pretrain_cfg)
+    pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
    ckpt = tf.train.Checkpoint(
        model=pretrain_model, **pretrain_model.checkpoint_items)
    saved_path = ckpt.save(self.get_temp_dir())

    config = question_answering.QuestionAnsweringConfig(
        init_checkpoint=saved_path,
-        model=self._encoder_config,
-        train_data=self._train_data_config)
-    task = question_answering.QuestionAnsweringTask(config)
-    model = task.build_model()
-    metrics = task.build_metrics()
-    dataset = task.build_inputs(config.train_data)
-
-    iterator = iter(dataset)
-    optimizer = tf.keras.optimizers.SGD(lr=0.1)
-    task.train_step(next(iterator), model, optimizer, metrics=metrics)
-    task.validation_step(next(iterator), model, metrics=metrics)
-    task.initialize(model)
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        validation_data=self._get_validation_data_config(
+            version_2_with_negative))
+    self._run_task(config)

  def test_task_with_fit(self):
    config = question_answering.QuestionAnsweringConfig(
-        model=self._encoder_config,
-        train_data=self._train_data_config)
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        validation_data=self._get_validation_data_config())
    task = question_answering.QuestionAnsweringTask(config)
    model = task.build_model()
    model = task.compile_model(
@@ -121,8 +155,9 @@ class QuestionAnsweringTaskTest(tf.test.TestCase):
    hub_module_url = self._export_bert_tfhub()
    config = question_answering.QuestionAnsweringConfig(
        hub_module_url=hub_module_url,
-        model=self._encoder_config,
-        train_data=self._train_data_config)
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        validation_data=self._get_validation_data_config())
    self._run_task(config)



--- a/official/nlp/tasks/sentence_prediction.py
+++ b/official/nlp/tasks/sentence_prediction.py
@@ -14,39 +14,50 @@
 # limitations under the License.
 # ==============================================================================
 """Sentence prediction (classification) task."""
+from typing import List, Union
+
 from absl import logging
 import dataclasses
 import numpy as np
+import orbit
 from scipy import stats
 from sklearn import metrics as sklearn_metrics
 import tensorflow as tf
 import tensorflow_hub as hub

 from official.core import base_task
+from official.modeling.hyperparams import base_config
 from official.modeling.hyperparams import config_definitions as cfg
-from official.nlp.configs import bert
-from official.nlp.data import sentence_prediction_dataloader
-from official.nlp.modeling import losses as loss_lib
+from official.nlp.configs import encoders
+from official.nlp.data import data_loader_factory
+from official.nlp.modeling import models
 from official.nlp.tasks import utils


+METRIC_TYPES = frozenset(
+    ['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr'])
+
+
+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """A classifier/regressor configuration."""
+  num_classes: int = 0
+  use_encoder_pooler: bool = False
+  encoder: encoders.TransformerEncoderConfig = (
+      encoders.TransformerEncoderConfig())
+
+
 @dataclasses.dataclass
 class SentencePredictionConfig(cfg.TaskConfig):
  """The model config."""
  # At most one of `init_checkpoint` and `hub_module_url` can
  # be specified.
  init_checkpoint: str = ''
+  init_cls_pooler: bool = False
  hub_module_url: str = ''
  metric_type: str = 'accuracy'
-  model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(
-      num_masked_tokens=0,  # No masked language modeling head.
-      cls_heads=[
-          bert.ClsHeadConfig(
-              inner_dim=768,
-              num_classes=3,
-              dropout_rate=0.1,
-              name='sentence_prediction')
-      ])
+  # Defines the concrete model config at instantiation time.
+  model: ModelConfig = ModelConfig()
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()

@@ -55,34 +66,45 @@ class SentencePredictionConfig(cfg.TaskConfig):
 class SentencePredictionTask(base_task.Task):
  """Task object for sentence_prediction."""

-  def __init__(self, params=cfg.TaskConfig):
-    super(SentencePredictionTask, self).__init__(params)
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
+    super(SentencePredictionTask, self).__init__(params, logging_dir)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
-                       '`pretrain_checkpoint_dir` can be specified.')
+                       '`init_checkpoint` can be specified.')
    if params.hub_module_url:
      self._hub_module = hub.load(params.hub_module_url)
    else:
      self._hub_module = None
+
+    if params.metric_type not in METRIC_TYPES:
+      raise ValueError('Invalid metric_type: {}'.format(params.metric_type))
    self.metric_type = params.metric_type

  def build_model(self):
    if self._hub_module:
-      encoder_from_hub = utils.get_encoder_from_hub(self._hub_module)
-      return bert.instantiate_bertpretrainer_from_cfg(
-          self.task_config.model, encoder_network=encoder_from_hub)
+      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
-      return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)
+      encoder_network = encoders.instantiate_encoder_from_cfg(
+          self.task_config.model.encoder)
+
+    # Currently, we only support bert-style sentence prediction finetuning.
+    return models.BertClassifier(
+        network=encoder_network,
+        num_classes=self.task_config.model.num_classes,
+        initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=self.task_config.model.encoder.initializer_range),
+        use_encoder_pooler=self.task_config.model.use_encoder_pooler)

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
-    loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
-        labels=labels,
-        predictions=tf.nn.log_softmax(
-            tf.cast(model_outputs['sentence_prediction'], tf.float32), axis=-1))
+    if self.task_config.model.num_classes == 1:
+      loss = tf.keras.losses.mean_squared_error(labels, model_outputs)
+    else:
+      loss = tf.keras.losses.sparse_categorical_crossentropy(
+          labels, tf.cast(model_outputs, tf.float32), from_logits=True)

    if aux_losses:
      loss += tf.add_n(aux_losses)
-    return loss
+    return tf.reduce_mean(loss)

  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
@@ -94,8 +116,12 @@ class SentencePredictionTask(base_task.Task):
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)
-        y = tf.ones((1, 1), dtype=tf.int32)
-        return (x, y)
+
+        if self.task_config.model.num_classes == 1:
+          y = tf.zeros((1,), dtype=tf.float32)
+        else:
+          y = tf.zeros((1, 1), dtype=tf.int32)
+        return x, y

      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
@@ -103,20 +129,23 @@ class SentencePredictionTask(base_task.Task):
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

-    return sentence_prediction_dataloader.SentencePredictionDataLoader(
-        params).load(input_context)
+    return data_loader_factory.get_data_loader(params).load(input_context)

  def build_metrics(self, training=None):
    del training
-    metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
+    if self.task_config.model.num_classes == 1:
+      metrics = [tf.keras.metrics.MeanSquaredError()]
+    else:
+      metrics = [
+          tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
    return metrics

  def process_metrics(self, metrics, labels, model_outputs):
    for metric in metrics:
-      metric.update_state(labels, model_outputs['sentence_prediction'])
+      metric.update_state(labels, model_outputs)

  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    compiled_metrics.update_state(labels, model_outputs['sentence_prediction'])
+    compiled_metrics.update_state(labels, model_outputs)

  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
    if self.metric_type == 'accuracy':
@@ -126,27 +155,27 @@ class SentencePredictionTask(base_task.Task):
    outputs = self.inference_step(features, model)
    loss = self.build_losses(
        labels=labels, model_outputs=outputs, aux_losses=model.losses)
+    logs = {self.loss: loss}
    if self.metric_type == 'matthews_corrcoef':
-      return {
-          self.loss:
-              loss,
+      logs.update({
          'sentence_prediction':
-              tf.expand_dims(
-                  tf.math.argmax(outputs['sentence_prediction'], axis=1),
-                  axis=0),
+              tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=0),
          'labels':
              labels,
-      }
+      })
    if self.metric_type == 'pearson_spearman_corr':
-      return {
-          self.loss: loss,
-          'sentence_prediction': outputs['sentence_prediction'],
+      logs.update({
+          'sentence_prediction': outputs,
          'labels': labels,
-      }
+      })
+    return logs

  def aggregate_logs(self, state=None, step_outputs=None):
+    if self.metric_type == 'accuracy':
+      return None
    if state is None:
      state = {'sentence_prediction': [], 'labels': []}
+    # TODO(b/160712818): Add support for concatenating partial batches.
    state['sentence_prediction'].append(
        np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']],
                       axis=0))
@@ -155,15 +184,21 @@ class SentencePredictionTask(base_task.Task):
    return state

  def reduce_aggregated_logs(self, aggregated_logs):
-    if self.metric_type == 'matthews_corrcoef':
+    if self.metric_type == 'accuracy':
+      return None
+    elif self.metric_type == 'matthews_corrcoef':
      preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
+      preds = np.reshape(preds, -1)
      labels = np.concatenate(aggregated_logs['labels'], axis=0)
+      labels = np.reshape(labels, -1)
      return {
          self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels)
      }
-    if self.metric_type == 'pearson_spearman_corr':
+    elif self.metric_type == 'pearson_spearman_corr':
      preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
+      preds = np.reshape(preds, -1)
      labels = np.concatenate(aggregated_logs['labels'], axis=0)
+      labels = np.reshape(labels, -1)
      pearson_corr = stats.pearsonr(preds, labels)[0]
      spearman_corr = stats.spearmanr(preds, labels)[0]
      corr_metric = (pearson_corr + spearman_corr) / 2
@@ -178,13 +213,65 @@ class SentencePredictionTask(base_task.Task):
      return

    pretrain2finetune_mapping = {
-        'encoder':
-            model.checkpoint_items['encoder'],
-        'next_sentence.pooler_dense':
-            model.checkpoint_items['sentence_prediction.pooler_dense'],
+        'encoder': model.checkpoint_items['encoder'],
    }
+    # TODO(b/160251903): Investigate why no pooler dense improves finetuning
+    # accuracies.
+    if self.task_config.init_cls_pooler:
+      pretrain2finetune_mapping[
+          'next_sentence.pooler_dense'] = model.checkpoint_items[
+              'sentence_prediction.pooler_dense']
    ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping)
-    status = ckpt.restore(ckpt_dir_or_file)
+    status = ckpt.read(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
-    logging.info('finished loading pretrained checkpoint from %s',
+    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
+
+
+def predict(task: SentencePredictionTask, params: cfg.DataConfig,
+            model: tf.keras.Model) -> List[Union[int, float]]:
+  """Predicts on the input data.
+
+  Args:
+    task: A `SentencePredictionTask` object.
+    params: A `cfg.DataConfig` object.
+    model: A keras.Model.
+
+  Returns:
+    A list of predictions with length of `num_examples`. For regression task,
+      each element in the list is the predicted score; for classification task,
+      each element is the predicted class id.
+  """
+  is_regression = task.task_config.model.num_classes == 1
+
+  @tf.function
+  def predict_step(iterator):
+    """Predicts on distributed devices."""
+
+    def _replicated_step(inputs):
+      """Replicated prediction calculation."""
+      x, _ = inputs
+      outputs = task.inference_step(x, model)
+      if is_regression:
+        return outputs
+      else:
+        return tf.argmax(outputs, axis=-1)
+
+    outputs = tf.distribute.get_strategy().run(
+        _replicated_step, args=(next(iterator),))
+    return tf.nest.map_structure(
+        tf.distribute.get_strategy().experimental_local_results, outputs)
+
+  def reduce_fn(state, outputs):
+    """Concatenates model's outputs."""
+    for per_replica_batch_predictions in outputs:
+      state.extend(per_replica_batch_predictions)
+    return state
+
+  loop_fn = orbit.utils.create_loop_fn(predict_step)
+  dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
+                                                 task.build_inputs, params)
+  # Set `num_steps` to -1 to exhaust the dataset.
+  predictions = loop_fn(
+      iter(dataset), num_steps=-1, state=[], reduce_fn=reduce_fn)
+  return predictions
--- a/official/nlp/tasks/sentence_prediction_test.py
+++ b/official/nlp/tasks/sentence_prediction_test.py
@@ -18,33 +18,59 @@ import functools
 import os

 from absl.testing import parameterized
+import numpy as np
 import tensorflow as tf

 from official.nlp.bert import configs
 from official.nlp.bert import export_tfhub
 from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import sentence_prediction_dataloader
 from official.nlp.tasks import sentence_prediction


+def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
+  """Creates a fake dataset."""
+  writer = tf.io.TFRecordWriter(output_path)
+
+  def create_int_feature(values):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+
+  def create_float_feature(values):
+    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+
+  for _ in range(num_examples):
+    features = {}
+    input_ids = np.random.randint(100, size=(seq_length))
+    features["input_ids"] = create_int_feature(input_ids)
+    features["input_mask"] = create_int_feature(np.ones_like(input_ids))
+    features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
+    features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
+
+    if num_classes == 1:
+      features["label_ids"] = create_float_feature([np.random.random()])
+    else:
+      features["label_ids"] = create_int_feature(
+          [np.random.random_integers(0, num_classes - 1, size=())])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+  writer.close()
+
+
 class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(SentencePredictionTaskTest, self).setUp()
-    self._train_data_config = bert.SentencePredictionDataConfig(
-        input_path="dummy", seq_length=128, global_batch_size=1)
+    self._train_data_config = (
+        sentence_prediction_dataloader.SentencePredictionDataConfig(
+            input_path="dummy", seq_length=128, global_batch_size=1))

  def get_model_config(self, num_classes):
-    return bert.BertPretrainerConfig(
+    return sentence_prediction.ModelConfig(
        encoder=encoders.TransformerEncoderConfig(
            vocab_size=30522, num_layers=1),
-        num_masked_tokens=0,
-        cls_heads=[
-            bert.ClsHeadConfig(
-                inner_dim=10,
-                num_classes=num_classes,
-                name="sentence_prediction")
-        ])
+        num_classes=num_classes)

  def _run_task(self, config):
    task = sentence_prediction.SentencePredictionTask(config)
@@ -79,17 +105,52 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
    pretrain_cfg = bert.BertPretrainerConfig(
        encoder=encoders.TransformerEncoderConfig(
            vocab_size=30522, num_layers=1),
-        num_masked_tokens=20,
        cls_heads=[
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=3, name="next_sentence")
        ])
-    pretrain_model = bert.instantiate_bertpretrainer_from_cfg(pretrain_cfg)
+    pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
    ckpt = tf.train.Checkpoint(
        model=pretrain_model, **pretrain_model.checkpoint_items)
    ckpt.save(config.init_checkpoint)
    task.initialize(model)

+  @parameterized.named_parameters(
+      {
+          "testcase_name": "regression",
+          "num_classes": 1,
+      },
+      {
+          "testcase_name": "classification",
+          "num_classes": 2,
+      },
+  )
+  def test_metrics_and_losses(self, num_classes):
+    config = sentence_prediction.SentencePredictionConfig(
+        init_checkpoint=self.get_temp_dir(),
+        model=self.get_model_config(num_classes),
+        train_data=self._train_data_config)
+    task = sentence_prediction.SentencePredictionTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    if num_classes == 1:
+      self.assertIsInstance(metrics[0], tf.keras.metrics.MeanSquaredError)
+    else:
+      self.assertIsInstance(
+          metrics[0], tf.keras.metrics.SparseCategoricalAccuracy)
+
+    dataset = task.build_inputs(config.train_data)
+    iterator = iter(dataset)
+    optimizer = tf.keras.optimizers.SGD(lr=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+
+    logs = task.validation_step(next(iterator), model, metrics=metrics)
+    loss = logs["loss"].numpy()
+    if num_classes == 1:
+      self.assertAlmostEqual(loss, 42.77483, places=3)
+    else:
+      self.assertAlmostEqual(loss, 3.57627e-6, places=3)
+
  @parameterized.parameters(("matthews_corrcoef", 2),
                            ("pearson_spearman_corr", 1))
  def test_np_metrics(self, metric_type, num_classes):
@@ -158,6 +219,35 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
        train_data=self._train_data_config)
    self._run_task(config)

+  @parameterized.named_parameters(("classification", 5), ("regression", 1))
+  def test_prediction(self, num_classes):
+    task_config = sentence_prediction.SentencePredictionConfig(
+        model=self.get_model_config(num_classes=num_classes),
+        train_data=self._train_data_config)
+    task = sentence_prediction.SentencePredictionTask(task_config)
+    model = task.build_model()
+
+    test_data_path = os.path.join(self.get_temp_dir(), "test.tf_record")
+    seq_length = 16
+    num_examples = 100
+    _create_fake_dataset(
+        test_data_path,
+        seq_length=seq_length,
+        num_classes=num_classes,
+        num_examples=num_examples)
+
+    test_data_config = (
+        sentence_prediction_dataloader.SentencePredictionDataConfig(
+            input_path=test_data_path,
+            seq_length=seq_length,
+            is_training=False,
+            label_type="int" if num_classes > 1 else "float",
+            global_batch_size=16,
+            drop_remainder=False))
+
+    predictions = sentence_prediction.predict(task, test_data_config, model)
+    self.assertLen(predictions, num_examples)
+

 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/tasks/tagging.py
+++ b/official/nlp/tasks/tagging.py
@@ -15,33 +15,48 @@
 # ==============================================================================
 """Tagging (e.g., NER/POS) task."""
 import logging
+from typing import List, Optional, Tuple
+
 import dataclasses
+import orbit
+
+from seqeval import metrics as seqeval_metrics
+
 import tensorflow as tf
 import tensorflow_hub as hub

 from official.core import base_task
+from official.modeling.hyperparams import base_config
 from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import encoders
-from official.nlp.data import tagging_data_loader
+from official.nlp.data import data_loader_factory
 from official.nlp.modeling import models
 from official.nlp.tasks import utils


+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """A base span labeler configuration."""
+  encoder: encoders.TransformerEncoderConfig = (
+      encoders.TransformerEncoderConfig())
+  head_dropout: float = 0.1
+  head_initializer_range: float = 0.02
+
+
 @dataclasses.dataclass
 class TaggingConfig(cfg.TaskConfig):
  """The model config."""
  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
  init_checkpoint: str = ''
  hub_module_url: str = ''
-  model: encoders.TransformerEncoderConfig = (
-      encoders.TransformerEncoderConfig())
+  model: ModelConfig = ModelConfig()

-  # The number of real labels. Note that a word may be tokenized into
-  # multiple word_pieces tokens, and we asssume the real label id (non-negative)
-  # is assigned to the first token of the word, and a negative label id is
-  # assigned to the remaining tokens. The negative label id will not contribute
-  # to loss and metrics.
-  num_classes: int = 0
+  # The real class names, the order of which should match real label id.
+  # Note that a word may be tokenized into multiple word_pieces tokens, and
+  # we assume the real label id (non-negative) is assigned to the first token
+  # of the word, and a negative label id is assigned to the remaining tokens.
+  # The negative label id will not contribute to loss and metrics.
+  class_names: Optional[List[str]] = None
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()

@@ -70,13 +85,13 @@ def _masked_labels_and_weights(y_true):
 class TaggingTask(base_task.Task):
  """Task object for tagging (e.g., NER or POS)."""

-  def __init__(self, params=cfg.TaskConfig):
-    super(TaggingTask, self).__init__(params)
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
+    super(TaggingTask, self).__init__(params, logging_dir)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
                       '`init_checkpoint` can be specified.')
-    if params.num_classes == 0:
-      raise ValueError('TaggingConfig.num_classes cannot be 0.')
+    if not params.class_names:
+      raise ValueError('TaggingConfig.class_names cannot be empty.')

    if params.hub_module_url:
      self._hub_module = hub.load(params.hub_module_url)
@@ -88,14 +103,14 @@ class TaggingTask(base_task.Task):
      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
      encoder_network = encoders.instantiate_encoder_from_cfg(
-          self.task_config.model)
+          self.task_config.model.encoder)

    return models.BertTokenClassifier(
        network=encoder_network,
-        num_classes=self.task_config.num_classes,
+        num_classes=len(self.task_config.class_names),
        initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=self.task_config.model.initializer_range),
-        dropout_rate=self.task_config.model.dropout_rate,
+            stddev=self.task_config.model.head_initializer_range),
+        dropout_rate=self.task_config.model.head_dropout,
        output='logits')

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
@@ -108,7 +123,7 @@ class TaggingTask(base_task.Task):
    loss = tf.math.divide_no_nan(numerator_loss, denominator_loss)
    return loss

-  def build_inputs(self, params, input_context=None):
+  def build_inputs(self, params: cfg.DataConfig, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
    if params.input_path == 'dummy':

@@ -123,7 +138,7 @@ class TaggingTask(base_task.Task):
        y = tf.random.uniform(
            shape=(1, params.seq_length),
            minval=-1,
-            maxval=self.task_config.num_classes,
+            maxval=len(self.task_config.class_names),
            dtype=tf.dtypes.int32)
        return (x, y)

@@ -133,22 +148,72 @@ class TaggingTask(base_task.Task):
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

-    dataset = tagging_data_loader.TaggingDataLoader(params).load(input_context)
-    return dataset
+    return data_loader_factory.get_data_loader(params).load(input_context)

-  def build_metrics(self, training=None):
-    del training
-    # TODO(chendouble): evaluate using seqeval's f1/precision/recall.
-    return [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]
+  def inference_step(self, inputs, model: tf.keras.Model):
+    """Performs the forward step."""
+    logits = model(inputs, training=False)
+    return {'logits': logits, 'predict_ids': tf.argmax(logits, axis=-1)}

-  def process_metrics(self, metrics, labels, model_outputs):
-    masked_labels, masked_weights = _masked_labels_and_weights(labels)
-    for metric in metrics:
-      metric.update_state(masked_labels, model_outputs, masked_weights)
+  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
+    """Validation step.

-  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    masked_labels, masked_weights = _masked_labels_and_weights(labels)
-    compiled_metrics.update_state(masked_labels, model_outputs, masked_weights)
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs.
+    """
+    features, labels = inputs
+    outputs = self.inference_step(features, model)
+    loss = self.build_losses(labels=labels, model_outputs=outputs['logits'])
+
+    # Negative label ids are padding labels which should be ignored.
+    real_label_index = tf.where(tf.greater_equal(labels, 0))
+    predict_ids = tf.gather_nd(outputs['predict_ids'], real_label_index)
+    label_ids = tf.gather_nd(labels, real_label_index)
+    return {
+        self.loss: loss,
+        'predict_ids': predict_ids,
+        'label_ids': label_ids,
+    }
+
+  def aggregate_logs(self, state=None, step_outputs=None):
+    """Aggregates over logs returned from a validation step."""
+    if state is None:
+      state = {'predict_class': [], 'label_class': []}
+
+    def id_to_class_name(batched_ids):
+      class_names = []
+      for per_example_ids in batched_ids:
+        class_names.append([])
+        for per_token_id in per_example_ids.numpy().tolist():
+          class_names[-1].append(self.task_config.class_names[per_token_id])
+
+      return class_names
+
+    # Convert id to class names, because `seqeval_metrics` relies on the class
+    # name to decide IOB tags.
+    state['predict_class'].extend(id_to_class_name(step_outputs['predict_ids']))
+    state['label_class'].extend(id_to_class_name(step_outputs['label_ids']))
+    return state
+
+  def reduce_aggregated_logs(self, aggregated_logs):
+    """Reduces aggregated logs over validation steps."""
+    label_class = aggregated_logs['label_class']
+    predict_class = aggregated_logs['predict_class']
+    return {
+        'f1':
+            seqeval_metrics.f1_score(label_class, predict_class),
+        'precision':
+            seqeval_metrics.precision_score(label_class, predict_class),
+        'recall':
+            seqeval_metrics.recall_score(label_class, predict_class),
+        'accuracy':
+            seqeval_metrics.accuracy_score(label_class, predict_class),
+    }

  def initialize(self, model):
    """Load a pretrained checkpoint (if exists) and then train from iter 0."""
@@ -161,5 +226,69 @@ class TaggingTask(base_task.Task):
    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
    status = ckpt.restore(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
-    logging.info('finished loading pretrained checkpoint from %s',
+    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
+
+
+def predict(task: TaggingTask, params: cfg.DataConfig,
+            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
+  """Predicts on the input data.
+
+  Args:
+    task: A `TaggingTask` object.
+    params: A `cfg.DataConfig` object.
+    model: A keras.Model.
+
+  Returns:
+    A tuple of `predict_ids` and `sentence_ids`, which are lists with length
+      of `num_examples`. Each element in `predict_ids` is a sequence of
+      predicted per-word label id, and each element in `sentence_ids` is the
+      sentence id of the corresponding example.
+  """
+
+  @tf.function
+  def predict_step(iterator):
+    """Predicts on distributed devices."""
+
+    def _replicated_step(inputs):
+      """Replicated prediction calculation."""
+      x, y = inputs
+      sentence_ids = x.pop('sentence_id')
+      outputs = task.inference_step(x, model)
+      predict_ids = outputs['predict_ids']
+      label_mask = tf.greater_equal(y, 0)
+      return dict(
+          predict_ids=predict_ids,
+          label_mask=label_mask,
+          sentence_ids=sentence_ids)
+
+    outputs = tf.distribute.get_strategy().run(
+        _replicated_step, args=(next(iterator),))
+    return tf.nest.map_structure(
+        tf.distribute.get_strategy().experimental_local_results, outputs)
+
+  def reduce_fn(state, outputs):
+    """Concatenates model's outputs."""
+    cur_predict_ids, cur_sentence_ids = state
+    for batch_predict_ids, batch_label_mask, batch_sentence_ids in zip(
+        outputs['predict_ids'], outputs['label_mask'],
+        outputs['sentence_ids']):
+      for tmp_predict_ids, tmp_label_mask, tmp_sentence_id in zip(
+          batch_predict_ids.numpy(), batch_label_mask.numpy(),
+          batch_sentence_ids.numpy()):
+        cur_sentence_ids.append(tmp_sentence_id)
+        cur_predict_ids.append([])
+        assert len(tmp_predict_ids) == len(tmp_label_mask)
+        for i in range(len(tmp_predict_ids)):
+          # Skip the padding label.
+          if tmp_label_mask[i]:
+            cur_predict_ids[-1].append(tmp_predict_ids[i])
+    return cur_predict_ids, cur_sentence_ids
+
+  loop_fn = orbit.utils.create_loop_fn(predict_step)
+  dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
+                                                 task.build_inputs, params)
+  # Set `num_steps` to -1 to exhaust the dataset.
+  predict_ids, sentence_ids = loop_fn(
+      iter(dataset), num_steps=-1, state=([], []), reduce_fn=reduce_fn)
+  return predict_ids, sentence_ids
--- a/official/nlp/tasks/tagging_test.py
+++ b/official/nlp/tasks/tagging_test.py
@@ -16,22 +16,46 @@
 """Tests for official.nlp.tasks.tagging."""
 import functools
 import os
+import numpy as np
 import tensorflow as tf

 from official.nlp.bert import configs
 from official.nlp.bert import export_tfhub
-from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import tagging_data_loader
 from official.nlp.tasks import tagging


+def _create_fake_dataset(output_path, seq_length, num_labels, num_examples):
+  """Creates a fake dataset."""
+  writer = tf.io.TFRecordWriter(output_path)
+
+  def create_int_feature(values):
+    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+    return f
+
+  for i in range(num_examples):
+    features = {}
+    input_ids = np.random.randint(100, size=(seq_length))
+    features["input_ids"] = create_int_feature(input_ids)
+    features["input_mask"] = create_int_feature(np.ones_like(input_ids))
+    features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
+    features["label_ids"] = create_int_feature(
+        np.random.random_integers(-1, num_labels - 1, size=(seq_length)))
+    features["sentence_id"] = create_int_feature([i])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+  writer.close()
+
+
 class TaggingTest(tf.test.TestCase):

  def setUp(self):
    super(TaggingTest, self).setUp()
    self._encoder_config = encoders.TransformerEncoderConfig(
        vocab_size=30522, num_layers=1)
-    self._train_data_config = bert.TaggingDataConfig(
+    self._train_data_config = tagging_data_loader.TaggingDataConfig(
        input_path="dummy", seq_length=128, global_batch_size=1)

  def _run_task(self, config):
@@ -56,9 +80,9 @@ class TaggingTest(tf.test.TestCase):

    config = tagging.TaggingConfig(
        init_checkpoint=saved_path,
-        model=self._encoder_config,
+        model=tagging.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
-        num_classes=3)
+        class_names=["O", "B-PER", "I-PER"])
    task = tagging.TaggingTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
@@ -72,9 +96,9 @@ class TaggingTest(tf.test.TestCase):

  def test_task_with_fit(self):
    config = tagging.TaggingConfig(
-        model=self._encoder_config,
+        model=tagging.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
-        num_classes=3)
+        class_names=["O", "B-PER", "I-PER"])

    task = tagging.TaggingTask(config)
    model = task.build_model()
@@ -115,11 +139,59 @@ class TaggingTest(tf.test.TestCase):
    hub_module_url = self._export_bert_tfhub()
    config = tagging.TaggingConfig(
        hub_module_url=hub_module_url,
-        model=self._encoder_config,
-        num_classes=4,
+        class_names=["O", "B-PER", "I-PER"],
        train_data=self._train_data_config)
    self._run_task(config)

+  def test_seqeval_metrics(self):
+    config = tagging.TaggingConfig(
+        model=tagging.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        class_names=["O", "B-PER", "I-PER"])
+    task = tagging.TaggingTask(config)
+    model = task.build_model()
+    dataset = task.build_inputs(config.train_data)
+
+    iterator = iter(dataset)
+    strategy = tf.distribute.get_strategy()
+    distributed_outputs = strategy.run(
+        functools.partial(task.validation_step, model=model),
+        args=(next(iterator),))
+    outputs = tf.nest.map_structure(strategy.experimental_local_results,
+                                    distributed_outputs)
+    aggregated = task.aggregate_logs(step_outputs=outputs)
+    aggregated = task.aggregate_logs(state=aggregated, step_outputs=outputs)
+    self.assertCountEqual({"f1", "precision", "recall", "accuracy"},
+                          task.reduce_aggregated_logs(aggregated).keys())
+
+  def test_predict(self):
+    task_config = tagging.TaggingConfig(
+        model=tagging.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        class_names=["O", "B-PER", "I-PER"])
+    task = tagging.TaggingTask(task_config)
+    model = task.build_model()
+
+    test_data_path = os.path.join(self.get_temp_dir(), "test.tf_record")
+    seq_length = 16
+    num_examples = 100
+    _create_fake_dataset(
+        test_data_path,
+        seq_length=seq_length,
+        num_labels=len(task_config.class_names),
+        num_examples=num_examples)
+    test_data_config = tagging_data_loader.TaggingDataConfig(
+        input_path=test_data_path,
+        seq_length=seq_length,
+        is_training=False,
+        global_batch_size=16,
+        drop_remainder=False,
+        include_sentence_id=True)
+
+    predict_ids, sentence_ids = tagging.predict(task, test_data_config, model)
+    self.assertLen(predict_ids, num_examples)
+    self.assertLen(sentence_ids, num_examples)
+

 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/transformer/beam_search.py
+++ b/official/nlp/transformer/beam_search.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Beam search in TF v2."""
-
-import tensorflow as tf
-
-from official.nlp.transformer import beam_search_v1 as v1
-
-_StateKeys = v1._StateKeys  # pylint: disable=protected-access
-
-
-class SequenceBeamSearchV2(v1.SequenceBeamSearch):
-  """Implementation of beam search loop in v2."""
-
-  def search(self, initial_ids, initial_cache):
-    """Beam search for sequences with highest scores."""
-    state, state_shapes = self._create_initial_state(initial_ids, initial_cache)
-
-    finished_state = tf.nest.map_structure(
-        tf.stop_gradient,
-        tf.while_loop(self._continue_search,
-                      self._search_step,
-                      loop_vars=[state],
-                      shape_invariants=[state_shapes],
-                      parallel_iterations=1))
-    finished_state = finished_state[0]
-
-    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
-    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
-    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
-    finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
-    finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]
-
-    # 2.0 changes tf.where behavior. Should make parameters broadcastable.
-    finished_cond = tf.reduce_any(finished_flags, 1, name="finished_cond")
-    seq_cond = _expand_to_same_rank(finished_cond, finished_seq)
-    score_cond = _expand_to_same_rank(finished_cond, finished_scores)
-
-    # Account for corner case where there are no finished sequences for a
-    # particular batch item. In that case, return alive sequences for that batch
-    # item.
-    finished_seq = tf.where(seq_cond, finished_seq, alive_seq)
-    finished_scores = tf.where(
-        score_cond, finished_scores, alive_log_probs)
-    return finished_seq, finished_scores
-
-
-def sequence_beam_search(symbols_to_logits_fn,
-                         initial_ids,
-                         initial_cache,
-                         vocab_size,
-                         beam_size,
-                         alpha,
-                         max_decode_length,
-                         eos_id,
-                         padded_decode=False,
-                         dtype="float32"):
-  """Search for sequence of subtoken ids with the largest probability.
-
-  Args:
-    symbols_to_logits_fn: A function that takes in ids, index, and cache as
-      arguments. The passed in arguments will have shape:
-        ids -> A tensor with shape [batch_size * beam_size, index].
-        index -> A scalar.
-        cache -> A nested dictionary of tensors [batch_size * beam_size, ...].
-      The function must return a tuple of logits and new cache:
-        logits -> A tensor with shape [batch * beam_size, vocab_size].
-        new cache -> A nested dictionary with the same shape/structure as the
-          inputted cache.
-    initial_ids: An int32 tensor with shape [batch_size]. Starting ids for
-      each batch item.
-    initial_cache: A dictionary, containing starting decoder variables
-      information.
-    vocab_size: An integer, the size of tokens.
-    beam_size: An integer, the number of beams.
-    alpha: A float, defining the strength of length normalization.
-    max_decode_length: An integer, the maximum length to decoded a sequence.
-    eos_id: An integer, ID of eos token, used to determine when a sequence has
-      finished.
-    padded_decode: A bool, indicating if max_sequence_length padding is used
-      for beam search.
-    dtype: A tensorflow data type used for score computation. The default is
-      tf.float32.
-
-  Returns:
-    Top decoded sequences [batch_size, beam_size, max_decode_length]
-    sequence scores [batch_size, beam_size]
-  """
-  batch_size = (
-      initial_ids.shape.as_list()[0] if padded_decode else
-      tf.shape(initial_ids)[0])
-  sbs = SequenceBeamSearchV2(symbols_to_logits_fn, vocab_size, batch_size,
-                             beam_size, alpha, max_decode_length, eos_id,
-                             padded_decode, dtype)
-  return sbs.search(initial_ids, initial_cache)
-
-
-def _expand_to_same_rank(tensor, target):
-  """Expands a given tensor to target's rank to be broadcastable.
-
-  Args:
-    tensor: input tensor to tile. Shape: [b, d1, ..., da]
-    target: target tensor. Shape: [b, d1, ..., da, ..., dn]
-
-  Returns:
-    Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target.
-
-  Raises:
-    ValueError, if the shape rank of rank tensor/target is None.
-  """
-  if tensor.shape.rank is None:
-    raise ValueError("Expect rank for tensor shape, but got None.")
-  if target.shape.rank is None:
-    raise ValueError("Expect rank for target shape, but got None.")
-
-  with tf.name_scope("expand_rank"):
-    diff_rank = target.shape.rank - tensor.shape.rank
-    for _ in range(diff_rank):
-      tensor = tf.expand_dims(tensor, -1)
-    return tensor
--- a/official/nlp/transformer/beam_search_v1.py
+++ b/official/nlp/transformer/beam_search_v1.py
@@ -13,126 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 """Beam search to find the translated sequence with the highest probability.
-
-Source implementation from Tensor2Tensor:
-https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/beam_search.py
 """

-import numpy as np
 import tensorflow.compat.v1 as tf
-from tensorflow.python.util import nest
-
-
-def inf(dtype):
-  """Returns a value close to infinity, but is still finite in `dtype`.
-
-  This is useful to get a very large value that is still zero when multiplied by
-  zero. The floating-point "Inf" value is NaN when multiplied by zero.
-
-  Args:
-    dtype: A dtype. The returned value will be finite when casted to this dtype.
-
-  Returns:
-    A very large value.
-  """
-  if dtype == "float32" or dtype == "bfloat16":
-    return 1e7
-  elif dtype == "float16":
-    # Disable no-member lint error, as the linter thinks np.float16 does not
-    # exist for some reason.
-    return np.finfo(np.float16).max  # pylint: disable=no-member
-  else:
-    raise AssertionError('Invalid dtype: %s' % dtype)
-
-
-class _StateKeys(object):
-  """Keys to dictionary storing the state of the beam search loop."""
-
-  # Variable storing the loop index.
-  CUR_INDEX = "CUR_INDEX"
+from official.nlp.modeling.ops import beam_search

-  # Top sequences that are alive for each batch item. Alive sequences are ones
-  # that have not generated an EOS token. Sequences that reach EOS are marked as
-  # finished and moved to the FINISHED_SEQ tensor.
-  # Has shape [batch_size, beam_size, CUR_INDEX + 1]
-  ALIVE_SEQ = "ALIVE_SEQ"
-  # Log probabilities of each alive sequence. Shape [batch_size, beam_size]
-  ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS"
-  # Dictionary of cached values for each alive sequence. The cache stores
-  # the encoder output, attention bias, and the decoder attention output from
-  # the previous iteration.
-  ALIVE_CACHE = "ALIVE_CACHE"
+_StateKeys = beam_search._StateKeys  # pylint: disable=protected-access

-  # Top finished sequences for each batch item.
-  # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are
-  # shorter than CUR_INDEX + 1 are padded with 0s.
-  FINISHED_SEQ = "FINISHED_SEQ"
-  # Scores for each finished sequence. Score = log probability / length norm
-  # Shape [batch_size, beam_size]
-  FINISHED_SCORES = "FINISHED_SCORES"
-  # Flags indicating which sequences in the finished sequences are finished.
-  # At the beginning, all of the sequences in FINISHED_SEQ are filler values.
-  # True -> finished sequence, False -> filler. Shape [batch_size, beam_size]
-  FINISHED_FLAGS = "FINISHED_FLAGS"

-
-class SequenceBeamSearch(object):
+class SequenceBeamSearch(beam_search.SequenceBeamSearch):
  """Implementation of beam search loop."""

-  def __init__(self,
-               symbols_to_logits_fn,
-               vocab_size,
-               batch_size,
-               beam_size,
-               alpha,
-               max_decode_length,
-               eos_id,
-               padded_decode,
-               dtype=tf.float32):
-    """Initialize sequence beam search.
-
-    Args:
-      symbols_to_logits_fn: A function to provide logits, which is the
-        interface to the Transformer model. The passed in arguments are:
-          ids -> A tensor with shape [batch_size * beam_size, index].
-          index -> A scalar.
-          cache -> A nested dictionary of tensors [batch_size * beam_size, ...].
-        The function must return a tuple of logits and the updated cache:
-          logits -> A tensor with shape [batch * beam_size, vocab_size].
-          updated cache -> A nested dictionary with the same structure as the
-            input cache.
-      vocab_size: An integer, the size of the vocabulary, used for topk
-        computation.
-      batch_size: An integer, the decode batch size.
-      beam_size: An integer, number of beams for beam search.
-      alpha: A float, defining the strength of length normalization.
-      max_decode_length: An integer, the maximum number of steps to decode
-        a sequence.
-      eos_id: An integer. ID of end of sentence token.
-      padded_decode: A bool, indicating if max_sequence_length padding is used
-        for beam search.
-      dtype: A tensorflow data type used for score computation. The default is
-        tf.float32.
-    """
-    self.symbols_to_logits_fn = symbols_to_logits_fn
-    self.vocab_size = vocab_size
-    self.batch_size = batch_size
-    self.beam_size = beam_size
-    self.alpha = alpha
-    self.max_decode_length = max_decode_length
-    self.eos_id = eos_id
-    self.padded_decode = padded_decode
-    self.dtype = tf.as_dtype(dtype)
-
-  def search(self, initial_ids, initial_cache):
-    """Beam search for sequences with highest scores."""
-    state, state_shapes = self._create_initial_state(initial_ids, initial_cache)
-
-    finished_state = tf.while_loop(
-        self._continue_search, self._search_step, loop_vars=[state],
-        shape_invariants=[state_shapes], parallel_iterations=1, back_prop=False)
-    finished_state = finished_state[0]
-
+  def _process_finished_state(self, finished_state):
    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
@@ -148,360 +40,6 @@ class SequenceBeamSearch(object):
        tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
    return finished_seq, finished_scores

-  def _create_initial_state(self, initial_ids, initial_cache):
-    """Return initial state dictionary and its shape invariants.
-
-    Args:
-      initial_ids: initial ids to pass into the symbols_to_logits_fn.
-        int tensor with shape [batch_size, 1]
-      initial_cache: dictionary storing values to be passed into the
-        symbols_to_logits_fn.
-
-    Returns:
-        state and shape invariant dictionaries with keys from _StateKeys
-    """
-    for key, value in initial_cache.items():
-      for inner_value in nest.flatten(value):
-        if inner_value.dtype != self.dtype:
-          raise TypeError(
-              "initial_cache element for key '%s' has dtype %s that does not "
-              "match SequenceBeamSearch's dtype of %s. Value: %s" %
-              (key, value.dtype.name, self.dtype.name, inner_value))
-
-    # Current loop index (starts at 0)
-    cur_index = tf.constant(0)
-
-    # Create alive sequence with shape [batch_size, beam_size, 1]
-    alive_seq = _expand_to_beam_size(initial_ids, self.beam_size)
-    alive_seq = tf.expand_dims(alive_seq, axis=2)
-    if self.padded_decode:
-      alive_seq = tf.tile(alive_seq, [1, 1, self.max_decode_length + 1])
-
-    # Create tensor for storing initial log probabilities.
-    # Assume initial_ids are prob 1.0
-    initial_log_probs = tf.constant(
-        [[0.] + [-float("inf")] * (self.beam_size - 1)], dtype=self.dtype)
-    alive_log_probs = tf.tile(initial_log_probs, [self.batch_size, 1])
-
-    # Expand all values stored in the dictionary to the beam size, so that each
-    # beam has a separate cache.
-    alive_cache = nest.map_structure(
-        lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache)
-
-    # Initialize tensor storing finished sequences with filler values.
-    finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
-
-    # Set scores of the initial finished seqs to negative infinity.
-    finished_scores = tf.ones([self.batch_size, self.beam_size],
-                              dtype=self.dtype) * -inf(self.dtype)
-
-    # Initialize finished flags with all False values.
-    finished_flags = tf.zeros([self.batch_size, self.beam_size], tf.bool)
-
-    # Create state dictionary
-    state = {
-        _StateKeys.CUR_INDEX: cur_index,
-        _StateKeys.ALIVE_SEQ: alive_seq,
-        _StateKeys.ALIVE_LOG_PROBS: alive_log_probs,
-        _StateKeys.ALIVE_CACHE: alive_cache,
-        _StateKeys.FINISHED_SEQ: finished_seq,
-        _StateKeys.FINISHED_SCORES: finished_scores,
-        _StateKeys.FINISHED_FLAGS: finished_flags
-    }
-
-    # Create state invariants for each value in the state dictionary. Each
-    # dimension must be a constant or None. A None dimension means either:
-    #   1) the dimension's value is a tensor that remains the same but may
-    #      depend on the input sequence to the model (e.g. batch size).
-    #   2) the dimension may have different values on different iterations.
-    if self.padded_decode:
-      state_shape_invariants = {
-          _StateKeys.CUR_INDEX:
-              tf.TensorShape([]),
-          _StateKeys.ALIVE_SEQ:
-              tf.TensorShape(
-                  [self.batch_size, self.beam_size,
-                   self.max_decode_length + 1]),
-          _StateKeys.ALIVE_LOG_PROBS:
-              tf.TensorShape([self.batch_size, self.beam_size]),
-          _StateKeys.ALIVE_CACHE:
-              nest.map_structure(_get_shape, alive_cache),
-          _StateKeys.FINISHED_SEQ:
-              tf.TensorShape(
-                  [self.batch_size, self.beam_size,
-                   self.max_decode_length + 1]),
-          _StateKeys.FINISHED_SCORES:
-              tf.TensorShape([self.batch_size, self.beam_size]),
-          _StateKeys.FINISHED_FLAGS:
-              tf.TensorShape([self.batch_size, self.beam_size])
-      }
-    else:
-      state_shape_invariants = {
-          _StateKeys.CUR_INDEX:
-              tf.TensorShape([]),
-          _StateKeys.ALIVE_SEQ:
-              tf.TensorShape([None, self.beam_size, None]),
-          _StateKeys.ALIVE_LOG_PROBS:
-              tf.TensorShape([None, self.beam_size]),
-          _StateKeys.ALIVE_CACHE:
-              nest.map_structure(_get_shape_keep_last_dim, alive_cache),
-          _StateKeys.FINISHED_SEQ:
-              tf.TensorShape([None, self.beam_size, None]),
-          _StateKeys.FINISHED_SCORES:
-              tf.TensorShape([None, self.beam_size]),
-          _StateKeys.FINISHED_FLAGS:
-              tf.TensorShape([None, self.beam_size])
-      }
-
-    return state, state_shape_invariants
-
-  def _continue_search(self, state):
-    """Return whether to continue the search loop.
-
-    The loops should terminate when
-      1) when decode length has been reached, or
-      2) when the worst score in the finished sequences is better than the best
-         score in the alive sequences (i.e. the finished sequences are provably
-         unchanging)
-
-    Args:
-      state: A dictionary with the current loop state.
-
-    Returns:
-      Bool tensor with value True if loop should continue, False if loop should
-      terminate.
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
-    finished_scores = state[_StateKeys.FINISHED_SCORES]
-    finished_flags = state[_StateKeys.FINISHED_FLAGS]
-
-    not_at_max_decode_length = tf.less(i, self.max_decode_length)
-
-    # Calculate largest length penalty (the larger penalty, the better score).
-    max_length_norm = _length_normalization(self.alpha, self.max_decode_length,
-                                            dtype=self.dtype)
-    # Get the best possible scores from alive sequences.
-    best_alive_scores = alive_log_probs[:, 0] / max_length_norm
-
-    # Compute worst score in finished sequences for each batch element
-    finished_scores *= tf.cast(finished_flags,
-                               self.dtype)  # set filler scores to zero
-    lowest_finished_scores = tf.reduce_min(finished_scores, axis=1)
-
-    # If there are no finished sequences in a batch element, then set the lowest
-    # finished score to -INF for that element.
-    finished_batches = tf.reduce_any(finished_flags, 1)
-    lowest_finished_scores += ((1.0 -
-                                tf.cast(finished_batches, self.dtype)) *
-                               -inf(self.dtype))
-
-    worst_finished_score_better_than_best_alive_score = tf.reduce_all(
-        tf.greater(lowest_finished_scores, best_alive_scores)
-    )
-
-    return tf.logical_and(
-        not_at_max_decode_length,
-        tf.logical_not(worst_finished_score_better_than_best_alive_score)
-    )
-
-  def _search_step(self, state):
-    """Beam search loop body.
-
-    Grow alive sequences by a single ID. Sequences that have reached the EOS
-    token are marked as finished. The alive and finished sequences with the
-    highest log probabilities and scores are returned.
-
-    A sequence's finished score is calculating by dividing the log probability
-    by the length normalization factor. Without length normalization, the
-    search is more likely to return shorter sequences.
-
-    Args:
-      state: A dictionary with the current loop state.
-
-    Returns:
-      new state dictionary.
-    """
-    # Grow alive sequences by one token.
-    new_seq, new_log_probs, topk_ids, new_cache = self._grow_alive_seq(state)
-    new_finished_flags = tf.equal(topk_ids, self.eos_id)
-    # Collect top beam_size alive sequences
-    alive_state = self._get_new_alive_state(new_seq, new_log_probs,
-                                            new_finished_flags, new_cache)
-
-    # Combine newly finished sequences with existing finished sequences, and
-    # collect the top k scoring sequences.
-    finished_state = self._get_new_finished_state(state, new_seq, new_log_probs,
-                                                  new_finished_flags)
-
-    # Increment loop index and create new state dictionary
-    new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1}
-    new_state.update(alive_state)
-    new_state.update(finished_state)
-    return [new_state]
-
-  def _grow_alive_seq(self, state):
-    """Grow alive sequences by one token, and collect top 2*beam_size sequences.
-
-    2*beam_size sequences are collected because some sequences may have reached
-    the EOS token. 2*beam_size ensures that at least beam_size sequences are
-    still alive.
-
-    Args:
-      state: A dictionary with the current loop state.
-    Returns:
-      Tuple of
-      (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
-       Scores of returned sequences [batch_size, 2 * beam_size],
-       New alive cache, for each of the 2 * beam_size sequences)
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    alive_seq = state[_StateKeys.ALIVE_SEQ]
-    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
-    alive_cache = state[_StateKeys.ALIVE_CACHE]
-
-    beams_to_keep = 2 * self.beam_size
-
-    # Get logits for the next candidate IDs for the alive sequences. Get the new
-    # cache values at the same time.
-    if self.padded_decode:
-      flat_ids = tf.reshape(
-          tf.slice(alive_seq, [0, 0, i], [self.batch_size, self.beam_size, 1]),
-          [self.batch_size * self.beam_size, -1])
-    else:
-      flat_ids = _flatten_beam_dim(alive_seq)  # [batch_size * beam_size]
-    flat_cache = nest.map_structure(_flatten_beam_dim, alive_cache)
-
-    flat_logits, flat_cache = self.symbols_to_logits_fn(flat_ids, i, flat_cache)
-
-    # Unflatten logits to shape [batch_size, beam_size, vocab_size]
-    logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size)
-    new_cache = nest.map_structure(
-        lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size),
-        flat_cache)
-
-    # Convert logits to normalized log probs
-    candidate_log_probs = _log_prob_from_logits(logits)
-
-    # Calculate new log probabilities if each of the alive sequences were
-    # extended # by the the candidate IDs.
-    # Shape [batch_size, beam_size, vocab_size]
-    log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
-
-    # Each batch item has beam_size * vocab_size candidate sequences. For each
-    # batch item, get the k candidates with the highest log probabilities.
-    flat_log_probs = tf.reshape(log_probs,
-                                [-1, self.beam_size * self.vocab_size])
-    topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep)
-
-    # Extract the alive sequences that generate the highest log probabilities
-    # after being extended.
-    topk_beam_indices = topk_indices // self.vocab_size
-    topk_seq, new_cache = _gather_beams(
-        [alive_seq, new_cache], topk_beam_indices, self.batch_size,
-        beams_to_keep)
-
-    # Append the most probable IDs to the topk sequences
-    topk_ids = topk_indices % self.vocab_size
-    if self.padded_decode:
-      topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1])
-      # TODO(b/145533236, hongkuny): Reverts once TF fix the validation.
-      topk_seq = tf.tensor_scatter_nd_update(topk_seq, [[i + 1]],
-                                             tf.expand_dims(topk_ids, axis=0))
-      topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0])
-    else:
-      topk_seq = tf.concat([topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
-    return topk_seq, topk_log_probs, topk_ids, new_cache
-
-  def _get_new_alive_state(self, new_seq, new_log_probs, new_finished_flags,
-                           new_cache):
-    """Gather the top k sequences that are still alive.
-
-    Args:
-      new_seq: New sequences generated by growing the current alive sequences
-        int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1]
-      new_log_probs: Log probabilities of new sequences float32 tensor with
-        shape [batch_size, beam_size]
-      new_finished_flags: A boolean Tensor indicates which sequences are live
-        inside the beam.
-      new_cache: Dict of cached values for each sequence.
-
-    Returns:
-      Dictionary with alive keys from _StateKeys:
-        {Top beam_size sequences that are still alive (don't end with eos_id)
-         Log probabilities of top alive sequences
-         Dict cache storing decoder states for top alive sequences}
-    """
-    # To prevent finished sequences from being considered, set log probs to -inf
-    new_log_probs += tf.cast(new_finished_flags, self.dtype) * -inf(self.dtype)
-
-    top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams(
-        [new_seq, new_log_probs, new_cache], new_log_probs, self.batch_size,
-        self.beam_size)
-
-    return {
-        _StateKeys.ALIVE_SEQ: top_alive_seq,
-        _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs,
-        _StateKeys.ALIVE_CACHE: top_alive_cache
-    }
-
-  def _get_new_finished_state(self, state, new_seq, new_log_probs,
-                              new_finished_flags):
-    """Combine new and old finished sequences, and gather the top k sequences.
-
-    Args:
-      state: A dictionary with the current loop state.
-      new_seq: New sequences generated by growing the current alive sequences
-        int32 tensor with shape [batch_size, beam_size, i + 1]
-      new_log_probs: Log probabilities of new sequences float32 tensor with
-        shape [batch_size, beam_size]
-      new_finished_flags: A boolean Tensor indicates which sequences are live
-        inside the beam.
-
-    Returns:
-      Dictionary with finished keys from _StateKeys:
-        {Top beam_size finished sequences based on score,
-         Scores of finished sequences,
-         Finished flags of finished sequences}
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    finished_seq = state[_StateKeys.FINISHED_SEQ]
-    finished_scores = state[_StateKeys.FINISHED_SCORES]
-    finished_flags = state[_StateKeys.FINISHED_FLAGS]
-
-    # First append a column of 0-ids to finished_seq to increment the length.
-    # New shape of finished_seq: [batch_size, beam_size, i + 1]
-    if not self.padded_decode:
-      finished_seq = tf.concat([
-          finished_seq,
-          tf.zeros([self.batch_size, self.beam_size, 1], tf.int32)
-      ],
-                               axis=2)
-
-    # Calculate new seq scores from log probabilities.
-    length_norm = _length_normalization(self.alpha, i + 1, dtype=self.dtype)
-    new_scores = new_log_probs / length_norm
-
-    # Set the scores of the still-alive seq in new_seq to large negative values.
-    new_scores += ((1. - tf.cast(new_finished_flags, self.dtype)) *
-                   -inf(self.dtype))
-
-    # Combine sequences, scores, and flags.
-    finished_seq = tf.concat([finished_seq, new_seq], axis=1)
-    finished_scores = tf.concat([finished_scores, new_scores], axis=1)
-    finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1)
-
-    # Return the finished sequences with the best scores.
-    top_finished_seq, top_finished_scores, top_finished_flags = (
-        _gather_topk_beams([finished_seq, finished_scores, finished_flags],
-                           finished_scores, self.batch_size, self.beam_size))
-
-    return {
-        _StateKeys.FINISHED_SEQ: top_finished_seq,
-        _StateKeys.FINISHED_SCORES: top_finished_scores,
-        _StateKeys.FINISHED_FLAGS: top_finished_flags
-    }
-

 def sequence_beam_search(
    symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size,
@@ -536,140 +74,6 @@ def sequence_beam_search(
    Top decoded sequences [batch_size, beam_size, max_decode_length]
    sequence scores [batch_size, beam_size]
  """
-  batch_size = (
-      initial_ids.shape.as_list()[0] if padded_decode else
-      tf.shape(initial_ids)[0])
-  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size,
-                           beam_size, alpha, max_decode_length, eos_id,
-                           padded_decode)
+  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, beam_size, alpha,
+                           max_decode_length, eos_id, padded_decode)
  return sbs.search(initial_ids, initial_cache)
-
-
-def _log_prob_from_logits(logits):
-  return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True)
-
-
-def _length_normalization(alpha, length, dtype=tf.float32):
-  """Return length normalization factor."""
-  return tf.pow(((5. + tf.cast(length, dtype)) / 6.), alpha)
-
-
-def _expand_to_beam_size(tensor, beam_size):
-  """Tiles a given tensor by beam_size.
-
-  Args:
-    tensor: tensor to tile [batch_size, ...]
-    beam_size: How much to tile the tensor by.
-
-  Returns:
-    Tiled tensor [batch_size, beam_size, ...]
-  """
-  tensor = tf.expand_dims(tensor, axis=1)
-  tile_dims = [1] * tensor.shape.ndims
-  tile_dims[1] = beam_size
-
-  return tf.tile(tensor, tile_dims)
-
-
-def _shape_list(tensor):
-  """Return a list of the tensor's shape, and ensure no None values in list."""
-  # Get statically known shape (may contain None's for unknown dimensions)
-  shape = tensor.get_shape().as_list()
-
-  # Ensure that the shape values are not None
-  dynamic_shape = tf.shape(tensor)
-  for i in range(len(shape)):  # pylint: disable=consider-using-enumerate
-    if shape[i] is None:
-      shape[i] = dynamic_shape[i]
-  return shape
-
-
-def _get_shape_keep_last_dim(tensor):
-  shape_list = _shape_list(tensor)
-
-  # Only the last
-  for i in range(len(shape_list) - 1):
-    shape_list[i] = None
-
-  if isinstance(shape_list[-1], tf.Tensor):
-    shape_list[-1] = None
-  return tf.TensorShape(shape_list)
-
-
-def _get_shape(tensor):
-  """Return the shape of the input tensor."""
-  return tf.TensorShape(_shape_list(tensor))
-
-
-def _flatten_beam_dim(tensor):
-  """Reshapes first two dimensions in to single dimension.
-
-  Args:
-    tensor: Tensor to reshape of shape [A, B, ...]
-
-  Returns:
-    Reshaped tensor of shape [A*B, ...]
-  """
-  shape = _shape_list(tensor)
-  shape[0] *= shape[1]
-  shape.pop(1)  # Remove beam dim
-  return tf.reshape(tensor, shape)
-
-
-def _unflatten_beam_dim(tensor, batch_size, beam_size):
-  """Reshapes first dimension back to [batch_size, beam_size].
-
-  Args:
-    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
-    batch_size: Tensor, original batch size.
-    beam_size: int, original beam size.
-
-  Returns:
-    Reshaped tensor of shape [batch_size, beam_size, ...]
-  """
-  shape = _shape_list(tensor)
-  new_shape = [batch_size, beam_size] + shape[1:]
-  return tf.reshape(tensor, new_shape)
-
-
-def _gather_beams(nested, beam_indices, batch_size, new_beam_size):
-  """Gather beams from nested structure of tensors.
-
-  Each tensor in nested represents a batch of beams, where beam refers to a
-  single search state (beam search involves searching through multiple states
-  in parallel).
-
-  This function is used to gather the top beams, specified by
-  beam_indices, from the nested tensors.
-
-  Args:
-    nested: Nested structure (tensor, list, tuple or dict) containing tensors
-      with shape [batch_size, beam_size, ...].
-    beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
-     value in beam_indices must be between [0, beam_size), and are not
-     necessarily unique.
-    batch_size: int size of batch
-    new_beam_size: int number of beams to be pulled from the nested tensors.
-
-  Returns:
-    Nested structure containing tensors with shape
-      [batch_size, new_beam_size, ...]
-  """
-  # Computes the i'th coodinate that contains the batch index for gather_nd.
-  # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..].
-  batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size
-  batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size])
-
-  # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor
-  # with shape [batch_size, beam_size, 2], where the last dimension contains
-  # the (i, j) gathering coordinates.
-  coordinates = tf.stack([batch_pos, beam_indices], axis=2)
-
-  return nest.map_structure(
-      lambda state: tf.gather_nd(state, coordinates), nested)
-
-
-def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size):
-  """Gather top beams from nested structure."""
-  _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size)
-  return _gather_beams(nested, topk_indexes, batch_size, beam_size)
--- a/official/nlp/transformer/embedding_layer.py
+++ b/official/nlp/transformer/embedding_layer.py
@@ -43,6 +43,7 @@ class EmbeddingSharedWeights(tf.keras.layers.Layer):
      self.shared_weights = self.add_weight(
          "weights",
          shape=[self.vocab_size, self.hidden_size],
+          dtype=tf.float32,
          initializer=tf.random_normal_initializer(
              mean=0., stddev=self.hidden_size**-0.5))
    super(EmbeddingSharedWeights, self).build(input_shape)

--- a/official/nlp/transformer/optimizer.py
+++ b/official/nlp/transformer/optimizer.py
@@ -18,9 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import numpy as np
 import tensorflow as tf
-K = tf.keras.backend


 class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
@@ -66,72 +64,3 @@ class LearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
        'hidden_size': self.hidden_size,
        'warmup_steps': self.warmup_steps,
    }
-
-
-class LearningRateFn(object):
-  """Creates learning rate function."""
-
-  def __init__(self, learning_rate, hidden_size, warmup_steps):
-    self.learning_rate = learning_rate
-    self.hidden_size = hidden_size
-    self.warmup_steps = float(warmup_steps)
-
-  def __call__(self, global_step):
-    """Calculate learning rate with linear warmup and rsqrt decay."""
-    step = float(global_step)
-    learning_rate = self.learning_rate
-    learning_rate *= (self.hidden_size ** -0.5)
-    # Apply linear warmup
-    learning_rate *= np.minimum(1.0, step / self.warmup_steps)
-    # Apply rsqrt decay
-    learning_rate /= np.sqrt(np.maximum(step, self.warmup_steps))
-    return learning_rate
-
-
-class LearningRateScheduler(tf.keras.callbacks.Callback):
-  """Keras callback to schedule learning rate.
-
-  TODO(tianlin): Refactor this scheduler and LearningRateBatchScheduler in
-  official/resnet/keras/keras_common.py.
-  """
-
-  def __init__(self, schedule, init_steps=None, verbose=False):
-    super(LearningRateScheduler, self).__init__()
-    self.schedule = schedule
-    self.verbose = verbose
-    if init_steps is None:
-      init_steps = 0.0
-    self.steps = float(init_steps)   # Total steps during training.
-
-  def on_epoch_begin(self, epoch, logs=None):
-    if not hasattr(self.model.optimizer, 'lr'):
-      raise ValueError('Optimizer must have a "lr" attribute.')
-    if not hasattr(self.model.optimizer, 'iterations'):
-      raise ValueError('Optimizer must have a "iterations" attribute.')
-
-  def on_train_batch_begin(self, batch, logs=None):
-    """Adjusts learning rate for each train batch."""
-    if self.verbose > 0:
-      iterations = K.get_value(self.model.optimizer.iterations)
-      print('Original iteration %d' % iterations)
-
-    self.steps += 1.0
-    try:  # new API
-      lr = float(K.get_value(self.model.optimizer.lr))
-      lr = self.schedule(self.steps, lr)
-    except TypeError:  # Support for old API for backward compatibility
-      lr = self.schedule(self.steps)
-    if not isinstance(lr, (float, np.float32, np.float64)):
-      raise ValueError('The output of the "schedule" function '
-                       'should be float.')
-    K.set_value(self.model.optimizer.lr, lr)
-    K.set_value(self.model.optimizer.iterations, self.steps)
-
-    if self.verbose > 0:
-      print('Batch %05d Step %05d: LearningRateScheduler setting learning '
-            'rate to %s.' % (batch + 1, self.steps, lr))
-
-  def on_epoch_end(self, epoch, logs=None):
-    logs = logs or {}
-    logs['lr'] = K.get_value(self.model.optimizer.lr)
-    logs['steps'] = self.steps
--- a/official/nlp/transformer/transformer.py
+++ b/official/nlp/transformer/transformer.py
@@ -23,8 +23,8 @@ from __future__ import print_function

 import tensorflow as tf
 from official.nlp.modeling.layers import position_embedding
+from official.nlp.modeling.ops import beam_search
 from official.nlp.transformer import attention_layer
-from official.nlp.transformer import beam_search
 from official.nlp.transformer import embedding_layer
 from official.nlp.transformer import ffn_layer
 from official.nlp.transformer import metrics
@@ -52,7 +52,6 @@ def create_model(params, is_train):
      logits = tf.keras.layers.Lambda(lambda x: x, name="logits",
                                      dtype=tf.float32)(logits)
      model = tf.keras.Model([inputs, targets], logits)
-      # TODO(reedwm): Can we do this loss in float16 instead of float32?
      loss = metrics.transformer_loss(
          logits, targets, label_smoothing, vocab_size)
      model.add_loss(loss)
@@ -238,7 +237,6 @@ class Transformer(tf.keras.Model):
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length, dtype=self.params["dtype"])

-    # TODO(b/139770046): Refactor code with better naming of i.
    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.


--- a/official/nlp/transformer/transformer_main.py
+++ b/official/nlp/transformer/transformer_main.py
@@ -241,14 +241,13 @@ class TransformerTask(object):
    if params["use_ctl"]:
      train_ds_iterator = iter(train_ds)

-    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)
+    callbacks = self._create_callbacks(flags_obj.model_dir, params)

    # Only TimeHistory callback is supported for CTL
    if params["use_ctl"]:
      callbacks = [cb for cb in callbacks
                   if isinstance(cb, keras_utils.TimeHistory)]

-    # TODO(b/139418525): Refactor the custom training loop logic.
    @tf.function
    def train_steps(iterator, steps):
      """Training steps function for TPU runs.
@@ -408,14 +407,9 @@ class TransformerTask(object):
    for i in range(length):
      translate.translate_from_input(val_outputs[i], subtokenizer)

-  def _create_callbacks(self, cur_log_dir, init_steps, params):
+  def _create_callbacks(self, cur_log_dir, params):
    """Creates a list of callbacks."""
-    sfunc = optimizer.LearningRateFn(params["learning_rate"],
-                                     params["hidden_size"],
-                                     params["learning_rate_warmup_steps"])
-    scheduler_callback = optimizer.LearningRateScheduler(sfunc, init_steps)
    callbacks = misc.get_callbacks()
-    callbacks.append(scheduler_callback)
    if params["enable_checkpointing"]:
      ckpt_full_path = os.path.join(cur_log_dir, "cp-{epoch:04d}.ckpt")
      callbacks.append(
@@ -427,8 +421,6 @@ class TransformerTask(object):
    """Loads model weights when it is provided."""
    if init_weight_path:
      logging.info("Load weights: {}".format(init_weight_path))
-      # TODO(b/139414977): Having the same variable restoring method for both
-      # TPU and GPU.
      if self.use_tpu:
        checkpoint = tf.train.Checkpoint(
            model=model, optimizer=self._create_optimizer())
@@ -445,7 +437,7 @@ class TransformerTask(object):
        params["learning_rate"], params["hidden_size"],
        params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(
-        lr_schedule if self.use_tpu else params["learning_rate"],
+        lr_schedule,
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])

--- a/official/nlp/transformer/translate.py
+++ b/official/nlp/transformer/translate.py
@@ -181,7 +181,7 @@ def translate_file(model,
      raise ValueError("File output is a directory, will not save outputs to "
                       "file.")
    logging.info("Writing to file %s", output_file)
-    with tf.compat.v1.gfile.Open(output_file, "w") as f:
+    with tf.io.gfile.GFile(output_file, "w") as f:
      for i in sorted_keys:
        f.write("%s\n" % translations[i])


--- a/official/nlp/transformer/utils/metrics.py
+++ b/official/nlp/transformer/utils/metrics.py
@@ -67,7 +67,7 @@ def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
    # Calculate smoothing cross entropy
    with tf.name_scope("smoothing_cross_entropy", values=[logits, labels]):
      confidence = 1.0 - smoothing
-      low_confidence = (1.0 - confidence) / tf.to_float(vocab_size - 1)
+      low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
      soft_targets = tf.one_hot(
          tf.cast(labels, tf.int32),
          depth=vocab_size,
@@ -79,11 +79,11 @@ def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
      # Calculate the best (lowest) possible value of cross entropy, and
      # subtract from the cross entropy loss.
      normalizing_constant = -(
-          confidence * tf.log(confidence) + tf.to_float(vocab_size - 1) *
-          low_confidence * tf.log(low_confidence + 1e-20))
+          confidence * tf.log(confidence) + tf.cast(vocab_size - 1, tf.float32)
+          * low_confidence * tf.log(low_confidence + 1e-20))
      xentropy -= normalizing_constant

-    weights = tf.to_float(tf.not_equal(labels, 0))
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    return xentropy * weights, weights


@@ -142,24 +142,24 @@ def padded_accuracy(logits, labels):
  """Percentage of times that predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.to_float(tf.not_equal(labels, 0))
-    outputs = tf.to_int32(tf.argmax(logits, axis=-1))
-    padded_labels = tf.to_int32(labels)
-    return tf.to_float(tf.equal(outputs, padded_labels)), weights
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
+    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
+    padded_labels = tf.cast(labels, tf.int32)
+    return tf.cast(tf.equal(outputs, padded_labels), tf.float32), weights


 def padded_accuracy_topk(logits, labels, k):
  """Percentage of times that top-k predictions matches labels on non-0s."""
  with tf.variable_scope("padded_accuracy_topk", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.to_float(tf.not_equal(labels, 0))
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
    effective_k = tf.minimum(k, tf.shape(logits)[-1])
    _, outputs = tf.nn.top_k(logits, k=effective_k)
-    outputs = tf.to_int32(outputs)
-    padded_labels = tf.to_int32(labels)
+    outputs = tf.cast(outputs, tf.int32)
+    padded_labels = tf.cast(labels, tf.int32)
    padded_labels = tf.expand_dims(padded_labels, axis=-1)
    padded_labels += tf.zeros_like(outputs)  # Pad to same shape.
-    same = tf.to_float(tf.equal(outputs, padded_labels))
+    same = tf.cast(tf.equal(outputs, padded_labels), tf.float32)
    same_topk = tf.reduce_sum(same, axis=-1)
    return same_topk, weights

@@ -172,10 +172,11 @@ def padded_sequence_accuracy(logits, labels):
  """Percentage of times that predictions matches labels everywhere (non-0)."""
  with tf.variable_scope("padded_sequence_accuracy", values=[logits, labels]):
    logits, labels = _pad_tensors_to_same_length(logits, labels)
-    weights = tf.to_float(tf.not_equal(labels, 0))
-    outputs = tf.to_int32(tf.argmax(logits, axis=-1))
-    padded_labels = tf.to_int32(labels)
-    not_correct = tf.to_float(tf.not_equal(outputs, padded_labels)) * weights
+    weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
+    outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
+    padded_labels = tf.cast(labels, tf.int32)
+    not_correct = (tf.cast(tf.not_equal(outputs, padded_labels), tf.float32) *
+                   weights)
    axis = list(range(1, len(outputs.get_shape())))
    correct_seq = 1.0 - tf.minimum(1.0, tf.reduce_sum(not_correct, axis=axis))
    return correct_seq, tf.constant(1.0)
@@ -201,7 +202,7 @@ def bleu_score(logits, labels):
  Returns:
    bleu: int, approx bleu score
  """
-  predictions = tf.to_int32(tf.argmax(logits, axis=-1))
+  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func
  bleu = tf.py_func(compute_bleu, (labels, predictions), tf.float32)
  return bleu, tf.constant(1.0)
@@ -306,7 +307,7 @@ def rouge_2_fscore(logits, labels):
  Returns:
    rouge2_fscore: approx rouge-2 f1 score.
  """
-  predictions = tf.to_int32(tf.argmax(logits, axis=-1))
+  predictions = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
  # TODO: Look into removing use of py_func
  rouge_2_f_score = tf.py_func(rouge_n, (predictions, labels), tf.float32)
  return rouge_2_f_score, tf.constant(1.0)
@@ -383,7 +384,7 @@ def rouge_l_fscore(predictions, labels):
  Returns:
    rouge_l_fscore: approx rouge-l f1 score.
  """
-  outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
+  outputs = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
  rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (outputs, labels),
                               tf.float32)
  return rouge_l_f_score, tf.constant(1.0)

--- a/official/pip_package/setup.py
+++ b/official/pip_package/setup.py
@@ -45,6 +45,9 @@ def _get_requirements():
      os.path.join(os.path.dirname(__file__), '../requirements.txt'), 'r') as f:
    for line in f:
      package_name = line.strip()
+      # Skip empty lines and comment lines starting with "#".
+      if not package_name or package_name[0] == '#':
+        continue
      if package_name.startswith('-e '):
        dependency_links_tmp.append(package_name[3:].strip())
      else:

--- a/official/recommendation/ncf_common.py
+++ b/official/recommendation/ncf_common.py
@@ -94,7 +94,7 @@ def parse_flags(flags_obj):
      "beta2": flags_obj.beta2,
      "epsilon": flags_obj.epsilon,
      "match_mlperf": flags_obj.ml_perf,
-      "epochs_between_evals": FLAGS.epochs_between_evals,
+      "epochs_between_evals": flags_obj.epochs_between_evals,
      "keras_use_ctl": flags_obj.keras_use_ctl,
      "hr_threshold": flags_obj.hr_threshold,
      "stream_files": flags_obj.tpu is not None,

--- a/official/recommendation/ncf_input_pipeline.py
+++ b/official/recommendation/ncf_input_pipeline.py
@@ -25,10 +25,8 @@ import tensorflow.compat.v2 as tf
 # pylint: enable=g-bad-import-order

 from official.recommendation import constants as rconst
-from official.recommendation import movielens
 from official.recommendation import data_pipeline
-
-NUM_SHARDS = 16
+from official.recommendation import movielens


 def create_dataset_from_tf_record_files(input_file_pattern,
@@ -36,32 +34,23 @@ def create_dataset_from_tf_record_files(input_file_pattern,
                                        batch_size,
                                        is_training=True):
  """Creates dataset from (tf)records files for training/evaluation."""
+  if pre_batch_size != batch_size:
+    raise ValueError("Pre-batch ({}) size is not equal to batch "
+                     "size ({})".format(pre_batch_size, batch_size))

  files = tf.data.Dataset.list_files(input_file_pattern, shuffle=is_training)

-  def make_dataset(files_dataset, shard_index):
-    """Returns dataset for sharded tf record files."""
-    if pre_batch_size != batch_size:
-      raise ValueError("Pre-batch ({}) size is not equal to batch "
-                       "size ({})".format(pre_batch_size, batch_size))
-    files_dataset = files_dataset.shard(NUM_SHARDS, shard_index)
-    dataset = files_dataset.interleave(
-        tf.data.TFRecordDataset,
-        num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    decode_fn = functools.partial(
-        data_pipeline.DatasetManager.deserialize,
-        batch_size=pre_batch_size,
-        is_training=is_training)
-    dataset = dataset.map(
-        decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    return dataset
-
-  dataset = tf.data.Dataset.range(NUM_SHARDS)
-  map_fn = functools.partial(make_dataset, files)
-  dataset = dataset.interleave(
-      map_fn,
-      cycle_length=NUM_SHARDS,
+  dataset = files.interleave(
+      tf.data.TFRecordDataset,
+      cycle_length=16,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  decode_fn = functools.partial(
+      data_pipeline.DatasetManager.deserialize,
+      batch_size=pre_batch_size,
+      is_training=is_training)
+  dataset = dataset.map(
+      decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset