Merge remote-tracking branch 'upstream/master' into newavarecords

5a2cf36f · Kaushik Shivakumar · 258ddfc3 · a829e648 · 5a2cf36f · 5a2cf36f
Commit 5a2cf36f authored Jul 23, 2020 by Kaushik Shivakumar
20 changed files
--- a/official/nlp/modeling/networks/classification.py
+++ b/official/nlp/modeling/networks/classification.py
@@ -29,6 +29,9 @@ class Classification(tf.keras.Model):
  This network implements a simple classifier head based on a dense layer. If
  num_classes is one, it can be considered as a regression problem.

+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
+
  Arguments:
    input_width: The innermost dimension of the input tensor to this network.
    num_classes: The number of classes that this network should classify to. If

--- a/official/nlp/modeling/networks/encoder_scaffold.py
+++ b/official/nlp/modeling/networks/encoder_scaffold.py
@@ -49,6 +49,9 @@ class EncoderScaffold(tf.keras.Model):
  If the hidden_cls is not overridden, a default transformer layer will be
  instantiated.

+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
+
  Arguments:
    pooled_output_dim: The dimension of pooled output.
    pooler_layer_initializer: The initializer for the classification

--- a/official/nlp/modeling/networks/encoder_scaffold_test.py
+++ b/official/nlp/modeling/networks/encoder_scaffold_test.py
@@ -323,6 +323,28 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
    self.assertAllEqual(network.get_config(), new_network.get_config())


+class Embeddings(tf.keras.Model):
+
+  def __init__(self, vocab_size, hidden_size):
+    super().__init__()
+    self.inputs = [
+        tf.keras.layers.Input(
+            shape=(None,), dtype=tf.int32, name="input_word_ids"),
+        tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
+    ]
+    self.attention_mask = layers.SelfAttentionMask()
+    self.embedding_layer = layers.OnDeviceEmbedding(
+        vocab_size=vocab_size,
+        embedding_width=hidden_size,
+        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
+        name="word_embeddings")
+
+  def call(self, inputs):
+    word_ids, mask = inputs
+    word_embeddings = self.embedding_layer(word_ids)
+    return word_embeddings, self.attention_mask([word_embeddings, mask])
+
+
 @keras_parameterized.run_all_keras_modes
 class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):

@@ -334,20 +356,7 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
    # Build an embedding network to swap in for the default network. This one
    # will have 2 inputs (mask and word_ids) instead of 3, and won't use
    # positional embeddings.
-
-    word_ids = tf.keras.layers.Input(
-        shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
-    mask = tf.keras.layers.Input(
-        shape=(sequence_length,), dtype=tf.int32, name="input_mask")
-    embedding_layer = layers.OnDeviceEmbedding(
-        vocab_size=vocab_size,
-        embedding_width=hidden_size,
-        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        name="word_embeddings")
-    word_embeddings = embedding_layer(word_ids)
-    attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])
-    network = tf.keras.Model([word_ids, mask],
-                             [word_embeddings, attention_mask])
+    network = Embeddings(vocab_size, hidden_size)

    hidden_cfg = {
        "num_attention_heads":
@@ -371,8 +380,7 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
-        embedding_cls=network,
-        embedding_data=embedding_layer.embeddings)
+        embedding_cls=network)

    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -390,11 +398,6 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    _ = model.predict([word_id_data, mask_data])

-    # Test that we can get the embedding data that we passed to the object. This
-    # is necessary to support standard language model training.
-    self.assertIs(embedding_layer.embeddings,
-                  test_network.get_embedding_table())
-
  def test_serialize_deserialize(self):
    hidden_size = 32
    sequence_length = 21

--- a/official/nlp/modeling/networks/span_labeling.py
+++ b/official/nlp/modeling/networks/span_labeling.py
@@ -27,6 +27,8 @@ class SpanLabeling(tf.keras.Model):
  """Span labeling network head for BERT modeling.

  This network implements a simple single-span labeler based on a dense layer.
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).

  Arguments:
    input_width: The innermost dimension of the input tensor to this network.

--- a/official/nlp/modeling/networks/token_classification.py
+++ b/official/nlp/modeling/networks/token_classification.py
@@ -27,6 +27,8 @@ class TokenClassification(tf.keras.Model):
  """TokenClassification network head for BERT modeling.

  This network implements a simple token classifier head based on a dense layer.
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).

  Arguments:
    input_width: The innermost dimension of the input tensor to this network.

--- a/official/nlp/modeling/networks/transformer_encoder.py
+++ b/official/nlp/modeling/networks/transformer_encoder.py
@@ -39,6 +39,9 @@ class TransformerEncoder(tf.keras.Model):
  in "BERT: Pre-training of Deep Bidirectional Transformers for Language
  Understanding".

+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
+
  Arguments:
    vocab_size: The size of the token vocabulary.
    hidden_size: The size of the transformer hidden layers.

--- a/official/nlp/modeling/ops/__init__.py
+++ b/official/nlp/modeling/ops/__init__.py
+
--- a/official/nlp/modeling/ops/beam_search.py
+++ b/official/nlp/modeling/ops/beam_search.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Beam search to find the translated sequence with the highest probability."""
+
+import numpy as np
+import tensorflow as tf
+
+
+def inf(dtype):
+  """Returns a value close to infinity, but is still finite in `dtype`.
+
+  This is useful to get a very large value that is still zero when multiplied by
+  zero. The floating-point "Inf" value is NaN when multiplied by zero.
+
+  Args:
+    dtype: A dtype. The returned value will be finite when cast to this dtype.
+
+  Returns:
+    A very large value: 1e7 for float32 and bfloat16, and the largest finite
+    float16 value for float16.
+
+  Raises:
+    AssertionError: if dtype is not float32, bfloat16 or float16.
+  """
+  if dtype == "float32" or dtype == "bfloat16":
+    return 1e7
+  elif dtype == "float16":
+    # Disable no-member lint error, as the linter thinks np.float16 does not
+    # exist for some reason.
+    return np.finfo(np.float16).max  # pylint: disable=no-member
+  else:
+    raise AssertionError("Invalid dtype: %s" % dtype)
+
+
+class _StateKeys(object):
+  """Keys to dictionary storing the state of the beam search loop."""
+
+  # Variable storing the loop index.
+  CUR_INDEX = "CUR_INDEX"
+
+  # Top sequences that are alive for each batch item. Alive sequences are ones
+  # that have not generated an EOS token. Sequences that reach EOS are marked as
+  # finished and moved to the FINISHED_SEQ tensor.
+  # Has shape [batch_size, beam_size, CUR_INDEX + 1]
+  ALIVE_SEQ = "ALIVE_SEQ"
+  # Log probabilities of each alive sequence. Shape [batch_size, beam_size]
+  ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS"
+  # Dictionary of cached values for each alive sequence. The cache stores
+  # the encoder output, attention bias, and the decoder attention output from
+  # the previous iteration.
+  ALIVE_CACHE = "ALIVE_CACHE"
+
+  # Top finished sequences for each batch item.
+  # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are
+  # shorter than CUR_INDEX + 1 are padded with 0s.
+  FINISHED_SEQ = "FINISHED_SEQ"
+  # Scores for each finished sequence. Score = log probability / length norm
+  # Shape [batch_size, beam_size]
+  FINISHED_SCORES = "FINISHED_SCORES"
+  # Flags indicating which sequences in the finished sequences are finished.
+  # At the beginning, all of the sequences in FINISHED_SEQ are filler values.
+  # True -> finished sequence, False -> filler. Shape [batch_size, beam_size]
+  FINISHED_FLAGS = "FINISHED_FLAGS"
+
+
+def _expand_to_same_rank(tensor, target):
+  """Expands a given tensor to target's rank to be broadcastable.
+
+  Args:
+    tensor: input tensor to expand. Shape: [b, d1, ..., da]
+    target: target tensor. Shape: [b, d1, ..., da, ..., dn]
+
+  Returns:
+    Expanded tensor of shape [b, d1, ..., da, 1, ..., 1] with the same rank as
+    target.
+
+  Raises:
+    ValueError: if the static rank of either tensor or target is None.
+  """
+  if tensor.shape.rank is None:
+    raise ValueError("Expect rank for tensor shape, but got None.")
+  if target.shape.rank is None:
+    raise ValueError("Expect rank for target shape, but got None.")
+
+  with tf.name_scope("expand_rank"):
+    diff_rank = target.shape.rank - tensor.shape.rank
+    # Append one trailing size-1 axis per missing rank; the loop does nothing
+    # when diff_rank <= 0.
+    for _ in range(diff_rank):
+      tensor = tf.expand_dims(tensor, -1)
+    return tensor
+
+
+class SequenceBeamSearch(tf.Module):
+  """Implementation of beam search loop."""
+
+  def __init__(self,
+               symbols_to_logits_fn,
+               vocab_size,
+               beam_size,
+               alpha,
+               max_decode_length,
+               eos_id,
+               padded_decode,
+               dtype=tf.float32):
+    """Initialize sequence beam search.
+
+    Args:
+      symbols_to_logits_fn: A function to provide logits, which is the
+        interface to the Transformer model. The passed in arguments are: ids ->
+          A tensor with shape [batch_size * beam_size, index]. index -> A
+          scalar. cache -> A nested dictionary of tensors [batch_size *
+          beam_size, ...].
+        The function must return a tuple of logits and the updated cache: logits
+          -> A tensor with shape [batch * beam_size, vocab_size]. updated cache
+          -> A nested dictionary with the same structure as the input cache.
+      vocab_size: An integer, the size of the vocabulary, used for topk
+        computation.
+      beam_size: An integer, number of beams for beam search.
+      alpha: A float, defining the strength of length normalization.
+      max_decode_length: An integer, the maximum number of steps to decode a
+        sequence.
+      eos_id: An integer. ID of end of sentence token.
+      padded_decode: A bool, indicating if max_sequence_length padding is used
+        for beam search.
+      dtype: A tensorflow data type used for score computation. The default is
+        tf.float32.
+    """
+    self.symbols_to_logits_fn = symbols_to_logits_fn
+    self.vocab_size = vocab_size
+    self.beam_size = beam_size
+    self.alpha = alpha
+    self.max_decode_length = max_decode_length
+    self.eos_id = eos_id
+    self.padded_decode = padded_decode
+    self.dtype = tf.as_dtype(dtype)
+
+  def search(self, initial_ids, initial_cache):
+    """Beam search for sequences with highest scores.
+
+    Args:
+      initial_ids: initial ids to pass into the symbols_to_logits_fn. int tensor
+        with shape [batch_size]
+      initial_cache: dictionary storing values to be passed into the
+        symbols_to_logits_fn.
+
+    Returns:
+      finished_seq and finished_scores.
+    """
+    batch_size = (
+        initial_ids.shape.as_list()[0]
+        if self.padded_decode else tf.shape(initial_ids)[0])
+    state, state_shapes = self._create_initial_state(initial_ids, initial_cache,
+                                                     batch_size)
+
+    def _grow_alive_seq(state):
+      """Grow alive sequences by one token, collect top 2*beam_size sequences.
+
+      2*beam_size sequences are collected because some sequences may have
+      reached the EOS token. 2*beam_size ensures that at least beam_size
+      sequences are still alive.
+
+      Args:
+        state: A dictionary with the current loop state.
+
+      Returns:
+        Tuple of
+        (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
+         Scores of returned sequences [batch_size, 2 * beam_size],
+         New alive cache, for each of the 2 * beam_size sequences)
+      """
+      i = state[_StateKeys.CUR_INDEX]
+      alive_seq = state[_StateKeys.ALIVE_SEQ]
+      alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
+      alive_cache = state[_StateKeys.ALIVE_CACHE]
+
+      beams_to_keep = 2 * self.beam_size
+
+      # Get logits for the next candidate IDs for the alive sequences. Get the
+      # new cache values at the same time.
+      if self.padded_decode:
+        flat_ids = tf.reshape(
+            tf.slice(alive_seq, [0, 0, i], [batch_size, self.beam_size, 1]),
+            [batch_size * self.beam_size, -1])
+      else:
+        flat_ids = _flatten_beam_dim(alive_seq)  # [batch_size * beam_size]
+      flat_cache = tf.nest.map_structure(_flatten_beam_dim, alive_cache)
+
+      flat_logits, flat_cache = self.symbols_to_logits_fn(
+          flat_ids, i, flat_cache)
+
+      # Unflatten logits to shape [batch_size, beam_size, vocab_size]
+      logits = _unflatten_beam_dim(flat_logits, batch_size, self.beam_size)
+      new_cache = tf.nest.map_structure(
+          lambda t: _unflatten_beam_dim(t, batch_size, self.beam_size),
+          flat_cache)
+
+      # Convert logits to normalized log probs
+      candidate_log_probs = _log_prob_from_logits(logits)
+
+      # Calculate new log probabilities if each of the alive sequences were
+      # extended by the candidate IDs.
+      # Shape [batch_size, beam_size, vocab_size]
+      log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
+
+      # Each batch item has beam_size * vocab_size candidate sequences. For each
+      # batch item, get the k candidates with the highest log probabilities.
+      flat_log_probs = tf.reshape(log_probs,
+                                  [-1, self.beam_size * self.vocab_size])
+      topk_log_probs, topk_indices = tf.nn.top_k(
+          flat_log_probs, k=beams_to_keep)
+
+      # Extract the alive sequences that generate the highest log probabilities
+      # after being extended.
+      topk_beam_indices = topk_indices // self.vocab_size
+      topk_seq, new_cache = _gather_beams([alive_seq, new_cache],
+                                          topk_beam_indices, batch_size,
+                                          beams_to_keep)
+
+      # Append the most probable IDs to the topk sequences
+      topk_ids = topk_indices % self.vocab_size
+      if self.padded_decode:
+        topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1])
+        # TODO(b/145533236, hongkuny): Reverts once TF fix the validation.
+        topk_seq = tf.tensor_scatter_nd_update(topk_seq, [[i + 1]],
+                                               tf.expand_dims(topk_ids, axis=0))
+        topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0])
+      else:
+        topk_seq = tf.concat(
+            [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
+      return topk_seq, topk_log_probs, topk_ids, new_cache
+
+    def _get_new_alive_state(new_seq, new_log_probs, new_finished_flags,
+                             new_cache):
+      """Gather the top k sequences that are still alive.
+
+      Args:
+        new_seq: New sequences generated by growing the current alive sequences
+          int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1]
+        new_log_probs: Log probabilities of new sequences float32 tensor with
+          shape [batch_size, 2 * beam_size]
+        new_finished_flags: A boolean Tensor indicating which of the new
+          sequences are finished (True) rather than still alive.
+        new_cache: Dict of cached values for each sequence.
+
+      Returns:
+        Dictionary with alive keys from _StateKeys:
+          {Top beam_size sequences that are still alive (don't end with eos_id)
+           Log probabilities of top alive sequences
+           Dict cache storing decoder states for top alive sequences}
+      """
+      # To prevent finished sequences from being considered, set log probs to
+      # -inf.
+      new_log_probs += tf.cast(new_finished_flags,
+                               self.dtype) * -inf(self.dtype)
+
+      top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams(
+          [new_seq, new_log_probs, new_cache], new_log_probs, batch_size,
+          self.beam_size)
+
+      return {
+          _StateKeys.ALIVE_SEQ: top_alive_seq,
+          _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs,
+          _StateKeys.ALIVE_CACHE: top_alive_cache
+      }
+
+    def _get_new_finished_state(state, new_seq, new_log_probs,
+                                new_finished_flags):
+      """Combine new and old finished sequences, and gather the top k sequences.
+
+      Args:
+        state: A dictionary with the current loop state.
+        new_seq: New sequences generated by growing the current alive sequences
+          int32 tensor with shape [batch_size, 2 * beam_size, i + 1]
+        new_log_probs: Log probabilities of new sequences float32 tensor with
+          shape [batch_size, 2 * beam_size]
+        new_finished_flags: A boolean Tensor indicating which of the new
+          sequences are finished (True) rather than still alive.
+
+      Returns:
+        Dictionary with finished keys from _StateKeys:
+          {Top beam_size finished sequences based on score,
+           Scores of finished sequences,
+           Finished flags of finished sequences}
+      """
+      i = state[_StateKeys.CUR_INDEX]
+      finished_seq = state[_StateKeys.FINISHED_SEQ]
+      finished_scores = state[_StateKeys.FINISHED_SCORES]
+      finished_flags = state[_StateKeys.FINISHED_FLAGS]
+
+      # First append a column of 0-ids to finished_seq to increment the length.
+      # New shape of finished_seq: [batch_size, beam_size, i + 1]
+      if not self.padded_decode:
+        finished_seq = tf.concat(
+            [finished_seq,
+             tf.zeros([batch_size, self.beam_size, 1], tf.int32)],
+            axis=2)
+
+      # Calculate new seq scores from log probabilities.
+      length_norm = _length_normalization(self.alpha, i + 1, dtype=self.dtype)
+      new_scores = new_log_probs / length_norm
+
+      # Set the scores of the still-alive seq in new_seq to large negative
+      # values.
+      new_scores += ((1. - tf.cast(new_finished_flags, self.dtype)) *
+                     -inf(self.dtype))
+
+      # Combine sequences, scores, and flags.
+      finished_seq = tf.concat([finished_seq, new_seq], axis=1)
+      finished_scores = tf.concat([finished_scores, new_scores], axis=1)
+      finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1)
+
+      # Return the finished sequences with the best scores.
+      top_finished_seq, top_finished_scores, top_finished_flags = (
+          _gather_topk_beams([finished_seq, finished_scores, finished_flags],
+                             finished_scores, batch_size, self.beam_size))
+
+      return {
+          _StateKeys.FINISHED_SEQ: top_finished_seq,
+          _StateKeys.FINISHED_SCORES: top_finished_scores,
+          _StateKeys.FINISHED_FLAGS: top_finished_flags
+      }
+
+    def _search_step(state):
+      """Beam search loop body.
+
+      Grow alive sequences by a single ID. Sequences that have reached the EOS
+      token are marked as finished. The alive and finished sequences with the
+      highest log probabilities and scores are returned.
+
+      A sequence's finished score is calculated by dividing the log probability
+      by the length normalization factor. Without length normalization, the
+      search is more likely to return shorter sequences.
+
+      Args:
+        state: A dictionary with the current loop state.
+
+      Returns:
+        new state dictionary.
+      """
+      # Grow alive sequences by one token.
+      new_seq, new_log_probs, topk_ids, new_cache = _grow_alive_seq(state)
+      new_finished_flags = tf.equal(topk_ids, self.eos_id)
+      # Collect top beam_size alive sequences
+      alive_state = _get_new_alive_state(new_seq, new_log_probs,
+                                         new_finished_flags, new_cache)
+
+      # Combine newly finished sequences with existing finished sequences, and
+      # collect the top k scoring sequences.
+      finished_state = _get_new_finished_state(state, new_seq, new_log_probs,
+                                               new_finished_flags)
+
+      # Increment loop index and create new state dictionary
+      new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1}
+      new_state.update(alive_state)
+      new_state.update(finished_state)
+      return [new_state]
+
+    finished_state = tf.nest.map_structure(
+        tf.stop_gradient,
+        tf.while_loop(
+            self._continue_search,
+            _search_step,
+            loop_vars=[state],
+            shape_invariants=[state_shapes],
+            parallel_iterations=1))
+    finished_state = finished_state[0]
+    return self._process_finished_state(finished_state)
+
+  def _process_finished_state(self, finished_state):
+    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
+    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
+    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
+    finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
+    finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]
+    # TF2 changes tf.where behavior. Should make parameters broadcastable.
+    finished_cond = tf.reduce_any(finished_flags, 1, name="finished_cond")
+    seq_cond = _expand_to_same_rank(finished_cond, finished_seq)
+    score_cond = _expand_to_same_rank(finished_cond, finished_scores)
+
+    # Account for corner case where there are no finished sequences for a
+    # particular batch item. In that case, return alive sequences for that batch
+    # item.
+    finished_seq = tf.where(seq_cond, finished_seq, alive_seq)
+    finished_scores = tf.where(score_cond, finished_scores, alive_log_probs)
+    return finished_seq, finished_scores
+
+  def _create_initial_state(self, initial_ids, initial_cache, batch_size):
+    """Return initial state dictionary and its shape invariants.
+
+    Args:
+      initial_ids: Starting ids for each batch item. They are tiled to
+        beam_size and given a trailing length axis to form the alive sequences.
+      initial_cache: Dictionary (values may be nested structures) of tensors
+        that is passed on to symbols_to_logits_fn. Every tensor in it must have
+        dtype `self.dtype`.
+      batch_size: Batch size as computed by `search`: the static first
+        dimension of initial_ids when padded_decode is set, otherwise a scalar
+        tensor.
+
+    Returns:
+      Tuple of (state, state_shape_invariants). Both are dictionaries keyed by
+      the _StateKeys constants.
+
+    Raises:
+      TypeError: if any tensor in initial_cache does not have dtype
+        `self.dtype`.
+    """
+    for key, value in initial_cache.items():
+      for inner_value in tf.nest.flatten(value):
+        if inner_value.dtype != self.dtype:
+          # Report the dtype of the offending leaf tensor. `value` itself may
+          # be a nested structure (e.g. a dict) that has no `dtype` attribute,
+          # which would turn this TypeError into an AttributeError.
+          raise TypeError(
+              "initial_cache element for key '%s' has dtype %s that does not "
+              "match SequenceBeamSearch's dtype of %s. Value: %s" %
+              (key, inner_value.dtype.name, self.dtype.name, inner_value))
+
+    # Current loop index (starts at 0)
+    cur_index = tf.constant(0)
+
+    # Create alive sequence with shape [batch_size, beam_size, 1]
+    alive_seq = _expand_to_beam_size(initial_ids, self.beam_size)
+    alive_seq = tf.expand_dims(alive_seq, axis=2)
+    if self.padded_decode:
+      alive_seq = tf.tile(alive_seq, [1, 1, self.max_decode_length + 1])
+
+    # Create tensor for storing initial log probabilities.
+    # Assume initial_ids are prob 1.0
+    initial_log_probs = tf.constant([[0.] + [-float("inf")] *
+                                     (self.beam_size - 1)],
+                                    dtype=self.dtype)
+    alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
+
+    # Expand all values stored in the dictionary to the beam size, so that each
+    # beam has a separate cache.
+    alive_cache = tf.nest.map_structure(
+        lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache)
+
+    # Initialize tensor storing finished sequences with filler values.
+    finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
+
+    # Set scores of the initial finished seqs to negative infinity.
+    finished_scores = tf.ones([batch_size, self.beam_size],
+                              dtype=self.dtype) * -inf(self.dtype)
+
+    # Initialize finished flags with all False values.
+    finished_flags = tf.zeros([batch_size, self.beam_size], tf.bool)
+
+    # Create state dictionary
+    state = {
+        _StateKeys.CUR_INDEX: cur_index,
+        _StateKeys.ALIVE_SEQ: alive_seq,
+        _StateKeys.ALIVE_LOG_PROBS: alive_log_probs,
+        _StateKeys.ALIVE_CACHE: alive_cache,
+        _StateKeys.FINISHED_SEQ: finished_seq,
+        _StateKeys.FINISHED_SCORES: finished_scores,
+        _StateKeys.FINISHED_FLAGS: finished_flags
+    }
+
+    # Create state invariants for each value in the state dictionary. Each
+    # dimension must be a constant or None. A None dimension means either:
+    #   1) the dimension's value is a tensor that remains the same but may
+    #      depend on the input sequence to the model (e.g. batch size).
+    #   2) the dimension may have different values on different iterations.
+    if self.padded_decode:
+      state_shape_invariants = {
+          _StateKeys.CUR_INDEX:
+              tf.TensorShape([]),
+          _StateKeys.ALIVE_SEQ:
+              tf.TensorShape(
+                  [batch_size, self.beam_size, self.max_decode_length + 1]),
+          _StateKeys.ALIVE_LOG_PROBS:
+              tf.TensorShape([batch_size, self.beam_size]),
+          _StateKeys.ALIVE_CACHE:
+              tf.nest.map_structure(_get_shape, alive_cache),
+          _StateKeys.FINISHED_SEQ:
+              tf.TensorShape(
+                  [batch_size, self.beam_size, self.max_decode_length + 1]),
+          _StateKeys.FINISHED_SCORES:
+              tf.TensorShape([batch_size, self.beam_size]),
+          _StateKeys.FINISHED_FLAGS:
+              tf.TensorShape([batch_size, self.beam_size])
+      }
+    else:
+      state_shape_invariants = {
+          _StateKeys.CUR_INDEX:
+              tf.TensorShape([]),
+          _StateKeys.ALIVE_SEQ:
+              tf.TensorShape([None, self.beam_size, None]),
+          _StateKeys.ALIVE_LOG_PROBS:
+              tf.TensorShape([None, self.beam_size]),
+          _StateKeys.ALIVE_CACHE:
+              tf.nest.map_structure(_get_shape_keep_last_dim, alive_cache),
+          _StateKeys.FINISHED_SEQ:
+              tf.TensorShape([None, self.beam_size, None]),
+          _StateKeys.FINISHED_SCORES:
+              tf.TensorShape([None, self.beam_size]),
+          _StateKeys.FINISHED_FLAGS:
+              tf.TensorShape([None, self.beam_size])
+      }
+
+    return state, state_shape_invariants
+
+  def _continue_search(self, state):
+    """Return whether to continue the search loop.
+
+    The loop should terminate when
+      1) the maximum decode length has been reached, or
+      2) the worst score in the finished sequences is better than the best
+         score in the alive sequences (i.e. the finished sequences are provably
+         unchanging)
+
+    Args:
+      state: A dictionary with the current loop state.
+
+    Returns:
+      Bool tensor with value True if loop should continue, False if loop should
+      terminate.
+    """
+    i = state[_StateKeys.CUR_INDEX]
+    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
+    finished_scores = state[_StateKeys.FINISHED_SCORES]
+    finished_flags = state[_StateKeys.FINISHED_FLAGS]
+
+    not_at_max_decode_length = tf.less(i, self.max_decode_length)
+
+    # Calculate largest length penalty (the larger penalty, the better score).
+    max_length_norm = _length_normalization(
+        self.alpha, self.max_decode_length, dtype=self.dtype)
+    # Get the best possible scores from alive sequences.
+    best_alive_scores = alive_log_probs[:, 0] / max_length_norm
+
+    # Compute worst score in finished sequences for each batch element
+    finished_scores *= tf.cast(finished_flags,
+                               self.dtype)  # set filler scores to zero
+    lowest_finished_scores = tf.reduce_min(finished_scores, axis=1)
+
+    # If there are no finished sequences in a batch element, then set the lowest
+    # finished score to -INF for that element.
+    finished_batches = tf.reduce_any(finished_flags, 1)
+    lowest_finished_scores += ((1.0 - tf.cast(finished_batches, self.dtype)) *
+                               -inf(self.dtype))
+
+    worst_finished_score_better_than_best_alive_score = tf.reduce_all(
+        tf.greater(lowest_finished_scores, best_alive_scores))
+
+    return tf.logical_and(
+        not_at_max_decode_length,
+        tf.logical_not(worst_finished_score_better_than_best_alive_score))
+
+
+def sequence_beam_search(symbols_to_logits_fn,
+                         initial_ids,
+                         initial_cache,
+                         vocab_size,
+                         beam_size,
+                         alpha,
+                         max_decode_length,
+                         eos_id,
+                         padded_decode=False,
+                         dtype="float32"):
+  """Search for sequence of subtoken ids with the largest probability.
+
+  Thin convenience wrapper: builds a SequenceBeamSearch from the arguments and
+  returns the result of its `search` method.
+
+  Args:
+    symbols_to_logits_fn: A function that takes in ids, index, and cache as
+      arguments. The passed in arguments will have shape: ids -> A tensor with
+        shape [batch_size * beam_size, index]. index -> A scalar. cache -> A
+        nested dictionary of tensors [batch_size * beam_size, ...].
+      The function must return a tuple of logits and new cache: logits -> A
+        tensor with shape [batch * beam_size, vocab_size]. new cache -> A nested
+        dictionary with the same shape/structure as the inputted cache.
+    initial_ids: An int32 tensor with shape [batch_size]. Starting ids for each
+      batch item.
+    initial_cache: A dictionary, containing starting decoder variables
+      information.
+    vocab_size: An integer, the size of tokens.
+    beam_size: An integer, the number of beams.
+    alpha: A float, defining the strength of length normalization.
+    max_decode_length: An integer, the maximum length to decode a sequence.
+    eos_id: An integer, ID of eos token, used to determine when a sequence has
+      finished.
+    padded_decode: A bool, indicating if max_sequence_length padding is used for
+      beam search.
+    dtype: A tensorflow data type used for score computation. The default is
+      "float32".
+
+  Returns:
+    Top decoded sequences [batch_size, beam_size, max_decode_length]
+    sequence scores [batch_size, beam_size]
+  """
+  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, beam_size, alpha,
+                           max_decode_length, eos_id, padded_decode, dtype)
+  return sbs.search(initial_ids, initial_cache)
+
+
+def _log_prob_from_logits(logits):
+  """Return log probabilities by normalizing logits over axis 2 (log-softmax)."""
+  return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True)
+
+
+def _length_normalization(alpha, length, dtype=tf.float32):
+  """Return length normalization factor."""
+  return tf.pow(((5. + tf.cast(length, dtype)) / 6.), alpha)
+
+
+def _expand_to_beam_size(tensor, beam_size):
+  """Tiles a given tensor by beam_size.
+
+  Args:
+    tensor: tensor to tile [batch_size, ...]
+    beam_size: How much to tile the tensor by.
+
+  Returns:
+    Tiled tensor [batch_size, beam_size, ...]
+  """
+  tensor = tf.expand_dims(tensor, axis=1)
+  tile_dims = [1] * tensor.shape.ndims
+  tile_dims[1] = beam_size
+
+  return tf.tile(tensor, tile_dims)
+
+
+def _shape_list(tensor):
+  """Return a list of the tensor's shape, and ensure no None values in list."""
+  # Get statically known shape (may contain None's for unknown dimensions)
+  shape = tensor.get_shape().as_list()
+
+  # Ensure that the shape values are not None
+  dynamic_shape = tf.shape(tensor)
+  for i in range(len(shape)):  # pylint: disable=consider-using-enumerate
+    if shape[i] is None:
+      shape[i] = dynamic_shape[i]
+  return shape
+
+
+def _get_shape_keep_last_dim(tensor):
+  """Return a TensorShape for tensor with every dim but the last set to None.
+
+  The last dimension keeps its static size when one is known; if it is only
+  known dynamically it is also set to None.
+  """
+  shape_list = _shape_list(tensor)
+
+  # Only the last dimension may stay static; relax all leading dims to None.
+  for i in range(len(shape_list) - 1):
+    shape_list[i] = None
+
+  # _shape_list substitutes a tf.Tensor for dims that are not statically known;
+  # such a value cannot be part of a static shape, so treat it as unknown.
+  if isinstance(shape_list[-1], tf.Tensor):
+    shape_list[-1] = None
+  return tf.TensorShape(shape_list)
+
+
+def _get_shape(tensor):
+  """Return the shape of the input tensor."""
+  return tf.TensorShape(_shape_list(tensor))
+
+
+def _flatten_beam_dim(tensor):
+  """Reshapes first two dimensions in to single dimension.
+
+  Args:
+    tensor: Tensor to reshape of shape [A, B, ...]
+
+  Returns:
+    Reshaped tensor of shape [A*B, ...]
+  """
+  shape = _shape_list(tensor)
+  shape[0] *= shape[1]
+  shape.pop(1)  # Remove beam dim
+  return tf.reshape(tensor, shape)
+
+
+def _unflatten_beam_dim(tensor, batch_size, beam_size):
+  """Reshapes first dimension back to [batch_size, beam_size].
+
+  Args:
+    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
+    batch_size: Tensor, original batch size.
+    beam_size: int, original beam size.
+
+  Returns:
+    Reshaped tensor of shape [batch_size, beam_size, ...]
+  """
+  shape = _shape_list(tensor)
+  new_shape = [batch_size, beam_size] + shape[1:]
+  return tf.reshape(tensor, new_shape)
+
+
+def _gather_beams(nested, beam_indices, batch_size, new_beam_size):
+  """Gather beams from nested structure of tensors.
+
+  Each tensor in nested represents a batch of beams, where beam refers to a
+  single search state (beam search involves searching through multiple states
+  in parallel).
+
+  This function is used to gather the top beams, specified by
+  beam_indices, from the nested tensors.
+
+  Args:
+    nested: Nested structure (tensor, list, tuple or dict) containing tensors
+      with shape [batch_size, beam_size, ...].
+    beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
+      value in beam_indices must be between [0, beam_size), and are not
+      necessarily unique.
+    batch_size: int size of batch
+    new_beam_size: int number of beams to be pulled from the nested tensors.
+
+  Returns:
+    Nested structure containing tensors with shape
+      [batch_size, new_beam_size, ...]
+  """
+  # Computes the i'th coordinate that contains the batch index for gather_nd.
+  # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..].
+  batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size
+  batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size])
+
+  # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor
+  # with shape [batch_size, new_beam_size, 2], where the last dimension
+  # contains the (i, j) gathering coordinates.
+  coordinates = tf.stack([batch_pos, beam_indices], axis=2)
+
+  return tf.nest.map_structure(lambda state: tf.gather_nd(state, coordinates),
+                               nested)
+
+
+def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size):
+  """Gather top beams from nested structure."""
+  _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size)
+  return _gather_beams(nested, topk_indexes, batch_size, beam_size)
--- a/official/nlp/transformer/beam_search_v1_test.py
+++ b/official/nlp/transformer/beam_search_v1_test.py
@@ -14,33 +14,19 @@
 # ==============================================================================
 """Test beam search helper methods."""

-import tensorflow.compat.v1 as tf
+import tensorflow as tf

-from official.nlp.transformer import beam_search_v1 as beam_search
+from official.nlp.modeling.ops import beam_search


 class BeamSearchHelperTests(tf.test.TestCase):

-  def setUp(self):
-    super(BeamSearchHelperTests, self).setUp()
-    tf.compat.v1.disable_eager_execution()
-
  def test_expand_to_beam_size(self):
    x = tf.ones([7, 4, 2, 5])
    x = beam_search._expand_to_beam_size(x, 3)
-    with self.session() as sess:
-      shape = sess.run(tf.shape(x))
+    shape = tf.shape(x)
    self.assertAllEqual([7, 3, 4, 2, 5], shape)

-  def test_shape_list(self):
-    y = tf.compat.v1.placeholder(dtype=tf.int32, shape=[])
-    x = tf.ones([7, y, 2, 5])
-    shape = beam_search._shape_list(x)
-    self.assertIsInstance(shape[0], int)
-    self.assertIsInstance(shape[1], tf.Tensor)
-    self.assertIsInstance(shape[2], int)
-    self.assertIsInstance(shape[3], int)
-
  def test_get_shape_keep_last_dim(self):
    y = tf.constant(4.0)
    x = tf.ones([7, tf.cast(tf.sqrt(y), tf.int32), 2, 5])
@@ -51,16 +37,12 @@ class BeamSearchHelperTests(tf.test.TestCase):
  def test_flatten_beam_dim(self):
    x = tf.ones([7, 4, 2, 5])
    x = beam_search._flatten_beam_dim(x)
-    with self.session() as sess:
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([28, 2, 5], shape)
+    self.assertAllEqual([28, 2, 5], tf.shape(x))

  def test_unflatten_beam_dim(self):
    x = tf.ones([28, 2, 5])
    x = beam_search._unflatten_beam_dim(x, 7, 4)
-    with self.session() as sess:
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([7, 4, 2, 5], shape)
+    self.assertAllEqual([7, 4, 2, 5], tf.shape(x))

  def test_gather_beams(self):
    x = tf.reshape(tf.range(24), [2, 3, 4])
@@ -73,9 +55,6 @@ class BeamSearchHelperTests(tf.test.TestCase):
    #                  [20 21 22 23]]]

    y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2)
-    with self.session() as sess:
-      y = sess.run(y)
-
    self.assertAllEqual([[[4, 5, 6, 7],
                          [8, 9, 10, 11]],
                         [[12, 13, 14, 15],
@@ -87,9 +66,6 @@ class BeamSearchHelperTests(tf.test.TestCase):
    x_scores = [[0, 1, 1], [1, 0, 1]]

    y = beam_search._gather_topk_beams(x, x_scores, 2, 2)
-    with self.session() as sess:
-      y = sess.run(y)
-
    self.assertAllEqual([[[4, 5, 6, 7],
                          [8, 9, 10, 11]],
                         [[12, 13, 14, 15],

--- a/official/nlp/nhnet/models.py
+++ b/official/nlp/nhnet/models.py
@@ -31,7 +31,7 @@ from official.nlp.modeling.layers import multi_channel_attention
 from official.nlp.nhnet import configs
 from official.nlp.nhnet import decoder
 from official.nlp.nhnet import utils
-from official.nlp.transformer import beam_search
+from official.nlp.modeling.ops import beam_search


 def embedding_linear(embedding_matrix, x):

--- a/official/nlp/tasks/electra_task.py
+++ b/official/nlp/tasks/electra_task.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ELECTRA pretraining task (Joint Masked LM and Replaced Token Detection)."""
+import dataclasses
+import tensorflow as tf
+
+from official.core import base_task
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.configs import bert
+from official.nlp.configs import electra
+from official.nlp.data import pretrain_dataloader
+
+
+@dataclasses.dataclass
+class ELECTRAPretrainConfig(cfg.TaskConfig):
+  """Task configuration for ELECTRA pretraining.
+
+  Bundles the pretrainer model config with the train/validation data configs.
+  """
+  # Pretrainer config. The default carries a single classification head named
+  # 'next_sentence' with two classes.
+  model: electra.ELECTRAPretrainerConfig = electra.ELECTRAPretrainerConfig(
+      cls_heads=[
+          bert.ClsHeadConfig(
+              inner_dim=768,
+              num_classes=2,
+              dropout_rate=0.1,
+              name='next_sentence')
+      ])
+  # Input data configs for training and validation.
+  train_data: cfg.DataConfig = cfg.DataConfig()
+  validation_data: cfg.DataConfig = cfg.DataConfig()
+
+
+@base_task.register_task_cls(ELECTRAPretrainConfig)
+class ELECTRAPretrainTask(base_task.Task):
+  """ELECTRA Pretrain Task (Masked LM + Replaced Token Detection)."""
+
+  def build_model(self):
+    """Builds the ELECTRA pretrainer described by the task's model config."""
+    model_config = self.task_config.model
+    return electra.instantiate_pretrainer_from_cfg(model_config)
+
+  def build_losses(self,
+                   labels,
+                   model_outputs,
+                   metrics,
+                   aux_losses=None) -> tf.Tensor:
+    """Computes the joint ELECTRA pretraining loss and updates loss metrics.
+
+    The total is the generator masked-LM loss, plus the next-sentence loss when
+    `labels` contains `next_sentence_labels`, plus the replaced-token-detection
+    (RTD) loss scaled by `discriminator_loss_weight`, plus any auxiliary losses.
+
+    Args:
+      labels: dict of tensors; reads `masked_lm_ids`, `masked_lm_weights`,
+        `input_mask` and, optionally, `next_sentence_labels`.
+      model_outputs: dict of tensors; reads `lm_outputs`, `disc_logits`,
+        `disc_label` and, for the next-sentence head, `sentence_outputs`.
+      metrics: iterable of metric objects, looked up by their `name`. The
+        `lm_example_loss`, `discriminator_loss` and `total_loss` metrics (and
+        `next_sentence_loss` when next-sentence labels are present) are updated.
+      aux_losses: optional list of extra loss tensors added to the total.
+
+    Returns:
+      The total loss tensor.
+    """
+    metrics = {metric.name: metric for metric in metrics}
+
+    # Generator masked-LM loss: weighted mean over the masked positions.
+    lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
+        labels['masked_lm_ids'],
+        tf.cast(model_outputs['lm_outputs'], tf.float32),
+        from_logits=True)
+    lm_label_weights = labels['masked_lm_weights']
+    lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
+    lm_denominator_loss = tf.reduce_sum(lm_label_weights)
+    # divide_no_nan yields 0 when no position carries weight.
+    mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
+    metrics['lm_example_loss'].update_state(mlm_loss)
+
+    # Optional next-sentence prediction loss.
+    if 'next_sentence_labels' in labels:
+      sentence_labels = labels['next_sentence_labels']
+      sentence_outputs = tf.cast(
+          model_outputs['sentence_outputs'], dtype=tf.float32)
+      # Reduce the per-example losses to a scalar; otherwise adding them to the
+      # scalar masked-LM loss broadcasts the total into a per-example vector.
+      sentence_loss = tf.reduce_mean(
+          tf.keras.losses.sparse_categorical_crossentropy(
+              sentence_labels, sentence_outputs, from_logits=True))
+      metrics['next_sentence_loss'].update_state(sentence_loss)
+      total_loss = mlm_loss + sentence_loss
+    else:
+      total_loss = mlm_loss
+
+    # Discriminator replaced token detection (rtd) loss, averaged over the
+    # positions selected by the input mask.
+    rtd_logits = model_outputs['disc_logits']
+    rtd_labels = tf.cast(model_outputs['disc_label'], tf.float32)
+    input_mask = tf.cast(labels['input_mask'], tf.float32)
+    rtd_ind_loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        logits=rtd_logits, labels=rtd_labels)
+    rtd_numerator = tf.reduce_sum(input_mask * rtd_ind_loss)
+    rtd_denominator = tf.reduce_sum(input_mask)
+    rtd_loss = tf.math.divide_no_nan(rtd_numerator, rtd_denominator)
+    metrics['discriminator_loss'].update_state(rtd_loss)
+    total_loss = total_loss + (
+        self.task_config.model.discriminator_loss_weight * rtd_loss)
+
+    if aux_losses:
+      total_loss += tf.add_n(aux_losses)
+
+    metrics['total_loss'].update_state(total_loss)
+    return total_loss
+
+  def build_inputs(self, params, input_context=None):
+    """Returns tf.data.Dataset for pretraining.
+
+    Args:
+      params: data config. When `params.input_path` is 'dummy', an endless
+        dataset of all-zero examples is produced from `params.seq_length` and
+        `params.max_predictions_per_seq`; otherwise the config is handed to
+        `BertPretrainDataLoader`.
+      input_context: optional input context forwarded to the data loader.
+
+    Returns:
+      The dataset of pretraining examples.
+    """
+    if params.input_path != 'dummy':
+      loader = pretrain_dataloader.BertPretrainDataLoader(params)
+      return loader.load(input_context)
+
+    def _make_dummy_example(_):
+      zero_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
+      zero_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32)
+      return {
+          'input_word_ids': zero_ids,
+          'input_mask': zero_ids,
+          'input_type_ids': zero_ids,
+          'masked_lm_positions': zero_lm,
+          'masked_lm_ids': zero_lm,
+          'masked_lm_weights': tf.cast(zero_lm, dtype=tf.float32),
+          'next_sentence_labels': tf.zeros((1, 1), dtype=tf.int32),
+      }
+
+    # A single-element range repeated forever, mapped to the zero example.
+    return tf.data.Dataset.range(1).repeat().map(
+        _make_dummy_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+  def build_metrics(self, training=None):
+    """Creates the metric objects tracked during ELECTRA pretraining.
+
+    Args:
+      training: unused.
+
+    Returns:
+      A list of Keras metrics. The next-sentence accuracy and loss metrics are
+      included only when the training data config sets
+      `use_next_sentence_label`.
+    """
+    del training
+    mean = tf.keras.metrics.Mean
+    accuracy = tf.keras.metrics.SparseCategoricalAccuracy
+
+    metrics = [
+        accuracy(name='masked_lm_accuracy'),
+        mean(name='lm_example_loss'),
+        accuracy(name='discriminator_accuracy'),
+    ]
+    if self.task_config.train_data.use_next_sentence_label:
+      metrics += [
+          accuracy(name='next_sentence_accuracy'),
+          mean(name='next_sentence_loss'),
+      ]
+    metrics += [
+        mean(name='discriminator_loss'),
+        mean(name='total_loss'),
+    ]
+    return metrics
+
+  def process_metrics(self, metrics, labels, model_outputs):
+    """Updates whichever accuracy metrics are present in `metrics`.
+
+    Args:
+      metrics: iterable of metric objects, looked up by their `name`.
+      labels: dict of label tensors.
+      model_outputs: dict of model output tensors.
+    """
+    by_name = {metric.name: metric for metric in metrics}
+
+    if 'masked_lm_accuracy' in by_name:
+      by_name['masked_lm_accuracy'].update_state(
+          labels['masked_lm_ids'],
+          model_outputs['lm_outputs'],
+          labels['masked_lm_weights'])
+
+    if 'next_sentence_accuracy' in by_name:
+      by_name['next_sentence_accuracy'].update_state(
+          labels['next_sentence_labels'],
+          model_outputs['sentence_outputs'])
+
+    if 'discriminator_accuracy' in by_name:
+      # Expand the single discriminator logit into two-class logits
+      # [-logit, logit] so the sparse categorical accuracy metric can score it.
+      positive_logits = tf.expand_dims(model_outputs['disc_logits'], -1)
+      two_class_logits = tf.concat(
+          [-1.0 * positive_logits, positive_logits], -1)
+      by_name['discriminator_accuracy'].update_state(
+          model_outputs['disc_label'],
+          two_class_logits,
+          labels['input_mask'])
+
+  def train_step(self, inputs, model: tf.keras.Model,
+                 optimizer: tf.keras.optimizers.Optimizer, metrics):
+    """Runs one forward/backward pass and applies the gradients.
+
+    Args:
+      inputs: a dictionary of input tensors; also used as the labels.
+      model: the model, forward pass definition.
+      optimizer: the optimizer for this training step.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs, mapping `self.loss` to the unscaled loss.
+    """
+    with tf.GradientTape() as tape:
+      model_outputs = model(inputs, training=True)
+      # Per-replica loss.
+      loss = self.build_losses(
+          labels=inputs,
+          model_outputs=model_outputs,
+          metrics=metrics,
+          aux_losses=model.losses)
+      # The default gradient allreduce inside the optimizer sums across
+      # replicas, so divide the loss by the replica count first.
+      # TODO(b/154564893): enable loss scaling.
+      num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
+      scaled_loss = loss / num_replicas
+    trainable_vars = model.trainable_variables
+    gradients = tape.gradient(scaled_loss, trainable_vars)
+    optimizer.apply_gradients(list(zip(gradients, trainable_vars)))
+    self.process_metrics(metrics, inputs, model_outputs)
+    return {self.loss: loss}
+
+  def validation_step(self, inputs, model: tf.keras.Model, metrics):
+    """Validation step: forward pass plus loss and metric updates.
+
+    Args:
+      inputs: a dictionary of input tensors; also used as the labels.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+
+    Returns:
+      A dictionary of logs, mapping `self.loss` to the loss.
+    """
+    model_outputs = model(inputs, training=False)
+    eval_loss = self.build_losses(
+        labels=inputs,
+        model_outputs=model_outputs,
+        metrics=metrics,
+        aux_losses=model.losses)
+    self.process_metrics(metrics, inputs, model_outputs)
+    logs = {self.loss: eval_loss}
+    return logs
--- a/official/nlp/tasks/electra_task_test.py
+++ b/official/nlp/tasks/electra_task_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for official.nlp.tasks.electra_task."""
+
+import tensorflow as tf
+
+from official.nlp.configs import bert
+from official.nlp.configs import electra
+from official.nlp.configs import encoders
+from official.nlp.data import pretrain_dataloader
+from official.nlp.tasks import electra_task
+
+
+class ELECTRAPretrainTaskTest(tf.test.TestCase):
+  """Smoke test for `electra_task.ELECTRAPretrainTask`."""
+
+  def test_task(self):
+    """Runs one train step and one validation step on dummy input data."""
+    config = electra_task.ELECTRAPretrainConfig(
+        model=electra.ELECTRAPretrainerConfig(
+            generator_encoder=encoders.TransformerEncoderConfig(
+                vocab_size=30522, num_layers=1),
+            discriminator_encoder=encoders.TransformerEncoderConfig(
+                vocab_size=30522, num_layers=1),
+            num_masked_tokens=20,
+            sequence_length=128,
+            cls_heads=[
+                bert.ClsHeadConfig(
+                    inner_dim=10, num_classes=2, name="next_sentence")
+            ]),
+        train_data=pretrain_dataloader.BertPretrainDataConfig(
+            input_path="dummy",
+            max_predictions_per_seq=20,
+            seq_length=128,
+            global_batch_size=1))
+    task = electra_task.ELECTRAPretrainTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    dataset = task.build_inputs(config.train_data)
+
+    iterator = iter(dataset)
+    # `learning_rate` is the documented argument name; `lr` is only a
+    # deprecated backward-compatibility alias.
+    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    task.validation_step(next(iterator), model, metrics=metrics)
+
+
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/nlp/tasks/masked_lm.py
+++ b/official/nlp/tasks/masked_lm.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 # ==============================================================================
 """Masked language task."""
+from absl import logging
 import dataclasses
 import tensorflow as tf

@@ -26,6 +27,7 @@ from official.nlp.data import data_loader_factory
 @dataclasses.dataclass
 class MaskedLMConfig(cfg.TaskConfig):
  """The model config."""
+  init_checkpoint: str = ''
  model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(cls_heads=[
      bert.ClsHeadConfig(
          inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence')
@@ -38,8 +40,9 @@ class MaskedLMConfig(cfg.TaskConfig):
 class MaskedLMTask(base_task.Task):
  """Mock task object for testing."""

-  def build_model(self):
-    return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)
+  def build_model(self, params=None):
+    params = params or self.task_config.model
+    return bert.instantiate_pretrainer_from_cfg(params)

  def build_losses(self,
                   labels,
@@ -60,10 +63,10 @@ class MaskedLMTask(base_task.Task):
      sentence_labels = labels['next_sentence_labels']
      sentence_outputs = tf.cast(
          model_outputs['next_sentence'], dtype=tf.float32)
-      sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
-          sentence_labels,
-          sentence_outputs,
-          from_logits=True)
+      sentence_loss = tf.reduce_mean(
+          tf.keras.losses.sparse_categorical_crossentropy(sentence_labels,
+                                                          sentence_outputs,
+                                                          from_logits=True))
      metrics['next_sentence_loss'].update_state(sentence_loss)
      total_loss = mlm_loss + sentence_loss
    else:
@@ -171,3 +174,17 @@ class MaskedLMTask(base_task.Task):
        aux_losses=model.losses)
    self.process_metrics(metrics, inputs, outputs)
    return {self.loss: loss}
+
+  def initialize(self, model: tf.keras.Model):
+    """Restores pretrained weights from `task_config.init_checkpoint`, if set.
+
+    When the configured path is a directory, its latest checkpoint is used.
+    Nothing happens when no checkpoint path is available.
+
+    Args:
+      model: the keras.Model whose `checkpoint_items` are restored.
+    """
+    checkpoint_path = self.task_config.init_checkpoint
+    if tf.io.gfile.isdir(checkpoint_path):
+      checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
+    if not checkpoint_path:
+      return
+
+    # Restore every module the model exposes (e.g. encoder, masked_lm and cls
+    # pooler). The best initialization may vary case by case.
+    checkpoint = tf.train.Checkpoint(**model.checkpoint_items)
+    restore_status = checkpoint.read(checkpoint_path)
+    restore_status.expect_partial().assert_existing_objects_matched()
+    logging.info('Finished loading pretrained checkpoint from %s',
+                 checkpoint_path)
--- a/official/nlp/tasks/masked_lm_test.py
+++ b/official/nlp/tasks/masked_lm_test.py
@@ -27,9 +27,9 @@ class MLMTaskTest(tf.test.TestCase):

  def test_task(self):
    config = masked_lm.MaskedLMConfig(
+        init_checkpoint=self.get_temp_dir(),
        model=bert.BertPretrainerConfig(
            encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
-            num_masked_tokens=20,
            cls_heads=[
                bert.ClsHeadConfig(
                    inner_dim=10, num_classes=2, name="next_sentence")
@@ -49,6 +49,12 @@ class MLMTaskTest(tf.test.TestCase):
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)

+    # Saves a checkpoint.
+    ckpt = tf.train.Checkpoint(
+        model=model, **model.checkpoint_items)
+    ckpt.save(config.init_checkpoint)
+    task.initialize(model)
+

 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/tasks/question_answering.py
+++ b/official/nlp/tasks/question_answering.py
@@ -23,18 +23,26 @@ import tensorflow as tf
 import tensorflow_hub as hub

 from official.core import base_task
+from official.modeling.hyperparams import base_config
 from official.modeling.hyperparams import config_definitions as cfg
-from official.nlp.bert import input_pipeline
 from official.nlp.bert import squad_evaluate_v1_1
 from official.nlp.bert import squad_evaluate_v2_0
 from official.nlp.bert import tokenization
 from official.nlp.configs import encoders
+from official.nlp.data import data_loader_factory
 from official.nlp.data import squad_lib as squad_lib_wp
 from official.nlp.data import squad_lib_sp
 from official.nlp.modeling import models
 from official.nlp.tasks import utils


+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """Model configuration for the span labeler; wraps the encoder config."""
+  # Encoder config; defaults to a default-constructed TransformerEncoderConfig.
+  encoder: encoders.TransformerEncoderConfig = (
+      encoders.TransformerEncoderConfig())
+
+
 @dataclasses.dataclass
 class QuestionAnsweringConfig(cfg.TaskConfig):
  """The model config."""
@@ -44,8 +52,7 @@ class QuestionAnsweringConfig(cfg.TaskConfig):
  n_best_size: int = 20
  max_answer_length: int = 30
  null_score_diff_threshold: float = 0.0
-  model: encoders.TransformerEncoderConfig = (
-      encoders.TransformerEncoderConfig())
+  model: ModelConfig = ModelConfig()
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()

@@ -81,12 +88,12 @@ class QuestionAnsweringTask(base_task.Task):
      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
      encoder_network = encoders.instantiate_encoder_from_cfg(
-          self.task_config.model)
-
+          self.task_config.model.encoder)
+    # Currently, we only support bert-style question answering finetuning.
    return models.BertSpanLabeler(
        network=encoder_network,
        initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=self.task_config.model.initializer_range))
+            stddev=self.task_config.model.encoder.initializer_range))

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
    start_positions = labels['start_positions']
@@ -174,20 +181,13 @@ class QuestionAnsweringTask(base_task.Task):
      return dataset

    if params.is_training:
-      input_path = params.input_path
+      dataloader_params = params
    else:
      input_path = self._tf_record_input_path
+      dataloader_params = params.replace(input_path=input_path)

-    batch_size = input_context.get_per_replica_batch_size(
-        params.global_batch_size) if input_context else params.global_batch_size
-    # TODO(chendouble): add and use nlp.data.question_answering_dataloader.
-    dataset = input_pipeline.create_squad_dataset(
-        input_path,
-        params.seq_length,
-        batch_size,
-        is_training=params.is_training,
-        input_pipeline_context=input_context)
-    return dataset
+    return data_loader_factory.get_data_loader(
+        dataloader_params).load(input_context)

  def build_metrics(self, training=None):
    del training
@@ -289,5 +289,5 @@ class QuestionAnsweringTask(base_task.Task):
    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
    status = ckpt.read(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
-    logging.info('finished loading pretrained checkpoint from %s',
+    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
--- a/official/nlp/tasks/question_answering_test.py
+++ b/official/nlp/tasks/question_answering_test.py
@@ -24,6 +24,7 @@ from official.nlp.bert import configs
 from official.nlp.bert import export_tfhub
 from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import question_answering_dataloader
 from official.nlp.tasks import question_answering


@@ -33,7 +34,7 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
    super(QuestionAnsweringTaskTest, self).setUp()
    self._encoder_config = encoders.TransformerEncoderConfig(
        vocab_size=30522, num_layers=1)
-    self._train_data_config = bert.QADataConfig(
+    self._train_data_config = question_answering_dataloader.QADataConfig(
        input_path="dummy",
        seq_length=128,
        global_batch_size=1)
@@ -55,7 +56,8 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
      writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")

  def _get_validation_data_config(self, version_2_with_negative=False):
-    return bert.QADevDataConfig(
+    return question_answering_dataloader.QADataConfig(
+        is_training=False,
        input_path=self._val_input_path,
        input_preprocessed_data_path=self.get_temp_dir(),
        seq_length=128,
@@ -91,19 +93,18 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
    # Saves a checkpoint.
    pretrain_cfg = bert.BertPretrainerConfig(
        encoder=self._encoder_config,
-        num_masked_tokens=20,
        cls_heads=[
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=3, name="next_sentence")
        ])
-    pretrain_model = bert.instantiate_bertpretrainer_from_cfg(pretrain_cfg)
+    pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
    ckpt = tf.train.Checkpoint(
        model=pretrain_model, **pretrain_model.checkpoint_items)
    saved_path = ckpt.save(self.get_temp_dir())

    config = question_answering.QuestionAnsweringConfig(
        init_checkpoint=saved_path,
-        model=self._encoder_config,
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
        validation_data=self._get_validation_data_config(
            version_2_with_negative))
@@ -111,7 +112,7 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):

  def test_task_with_fit(self):
    config = question_answering.QuestionAnsweringConfig(
-        model=self._encoder_config,
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
        validation_data=self._get_validation_data_config())
    task = question_answering.QuestionAnsweringTask(config)
@@ -154,7 +155,7 @@ class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
    hub_module_url = self._export_bert_tfhub()
    config = question_answering.QuestionAnsweringConfig(
        hub_module_url=hub_module_url,
-        model=self._encoder_config,
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
        validation_data=self._get_validation_data_config())
    self._run_task(config)

--- a/official/nlp/tasks/sentence_prediction.py
+++ b/official/nlp/tasks/sentence_prediction.py
@@ -14,21 +14,39 @@
 # limitations under the License.
 # ==============================================================================
 """Sentence prediction (classification) task."""
+from typing import List, Union
+
 from absl import logging
 import dataclasses
 import numpy as np
+import orbit
 from scipy import stats
 from sklearn import metrics as sklearn_metrics
 import tensorflow as tf
 import tensorflow_hub as hub

 from official.core import base_task
+from official.modeling.hyperparams import base_config
 from official.modeling.hyperparams import config_definitions as cfg
-from official.nlp.configs import bert
+from official.nlp.configs import encoders
 from official.nlp.data import data_loader_factory
+from official.nlp.modeling import models
 from official.nlp.tasks import utils


+METRIC_TYPES = frozenset(
+    ['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr'])
+
+
+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """A classifier/regressor configuration.
+
+  Attributes:
+    num_classes: number of output classes passed to the classifier. The task
+      treats a value of 1 as regression (mean squared error loss); any other
+      value uses sparse categorical cross-entropy.
+    use_encoder_pooler: forwarded unchanged to `models.BertClassifier`.
+    encoder: config of the encoder network; its `initializer_range` also sets
+      the stddev of the classifier's initializer.
+  """
+  num_classes: int = 0
+  use_encoder_pooler: bool = False
+  encoder: encoders.TransformerEncoderConfig = (
+      encoders.TransformerEncoderConfig())
+
+
 @dataclasses.dataclass
 class SentencePredictionConfig(cfg.TaskConfig):
  """The model config."""
@@ -38,15 +56,8 @@ class SentencePredictionConfig(cfg.TaskConfig):
  init_cls_pooler: bool = False
  hub_module_url: str = ''
  metric_type: str = 'accuracy'
-  model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(
-      num_masked_tokens=0,  # No masked language modeling head.
-      cls_heads=[
-          bert.ClsHeadConfig(
-              inner_dim=768,
-              num_classes=3,
-              dropout_rate=0.1,
-              name='sentence_prediction')
-      ])
+  # Defines the concrete model config at instantiation time.
+  model: ModelConfig = ModelConfig()
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()

@@ -64,25 +75,36 @@ class SentencePredictionTask(base_task.Task):
      self._hub_module = hub.load(params.hub_module_url)
    else:
      self._hub_module = None
+
+    if params.metric_type not in METRIC_TYPES:
+      raise ValueError('Invalid metric_type: {}'.format(params.metric_type))
    self.metric_type = params.metric_type

  def build_model(self):
    if self._hub_module:
-      encoder_from_hub = utils.get_encoder_from_hub(self._hub_module)
-      return bert.instantiate_bertpretrainer_from_cfg(
-          self.task_config.model, encoder_network=encoder_from_hub)
+      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
-      return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)
+      encoder_network = encoders.instantiate_encoder_from_cfg(
+          self.task_config.model.encoder)
+
+    # Currently, we only support bert-style sentence prediction finetuning.
+    return models.BertClassifier(
+        network=encoder_network,
+        num_classes=self.task_config.model.num_classes,
+        initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=self.task_config.model.encoder.initializer_range),
+        use_encoder_pooler=self.task_config.model.use_encoder_pooler)

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
-    loss = tf.keras.losses.sparse_categorical_crossentropy(
-        labels,
-        tf.cast(model_outputs['sentence_prediction'], tf.float32),
-        from_logits=True)
+    if self.task_config.model.num_classes == 1:
+      loss = tf.keras.losses.mean_squared_error(labels, model_outputs)
+    else:
+      loss = tf.keras.losses.sparse_categorical_crossentropy(
+          labels, tf.cast(model_outputs, tf.float32), from_logits=True)

    if aux_losses:
      loss += tf.add_n(aux_losses)
-    return loss
+    return tf.reduce_mean(loss)

  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
@@ -94,8 +116,12 @@ class SentencePredictionTask(base_task.Task):
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)
-        y = tf.zeros((1, 1), dtype=tf.int32)
-        return (x, y)
+
+        if self.task_config.model.num_classes == 1:
+          y = tf.zeros((1,), dtype=tf.float32)
+        else:
+          y = tf.zeros((1, 1), dtype=tf.int32)
+        return x, y

      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
@@ -107,15 +133,19 @@ class SentencePredictionTask(base_task.Task):

  def build_metrics(self, training=None):
    del training
-    metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
+    if self.task_config.model.num_classes == 1:
+      metrics = [tf.keras.metrics.MeanSquaredError()]
+    else:
+      metrics = [
+          tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
    return metrics

  def process_metrics(self, metrics, labels, model_outputs):
    for metric in metrics:
-      metric.update_state(labels, model_outputs['sentence_prediction'])
+      metric.update_state(labels, model_outputs)

  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    compiled_metrics.update_state(labels, model_outputs['sentence_prediction'])
+    compiled_metrics.update_state(labels, model_outputs)

  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
    if self.metric_type == 'accuracy':
@@ -129,15 +159,13 @@ class SentencePredictionTask(base_task.Task):
    if self.metric_type == 'matthews_corrcoef':
      logs.update({
          'sentence_prediction':
-              tf.expand_dims(
-                  tf.math.argmax(outputs['sentence_prediction'], axis=1),
-                  axis=0),
+              tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=0),
          'labels':
              labels,
      })
    if self.metric_type == 'pearson_spearman_corr':
      logs.update({
-          'sentence_prediction': outputs['sentence_prediction'],
+          'sentence_prediction': outputs,
          'labels': labels,
      })
    return logs
@@ -147,6 +175,7 @@ class SentencePredictionTask(base_task.Task):
      return None
    if state is None:
      state = {'sentence_prediction': [], 'labels': []}
+    # TODO(b/160712818): Add support for concatenating partial batches.
    state['sentence_prediction'].append(
        np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']],
                       axis=0))
@@ -155,15 +184,21 @@ class SentencePredictionTask(base_task.Task):
    return state

  def reduce_aggregated_logs(self, aggregated_logs):
-    if self.metric_type == 'matthews_corrcoef':
+    if self.metric_type == 'accuracy':
+      return None
+    elif self.metric_type == 'matthews_corrcoef':
      preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
+      preds = np.reshape(preds, -1)
      labels = np.concatenate(aggregated_logs['labels'], axis=0)
+      labels = np.reshape(labels, -1)
      return {
          self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels)
      }
-    if self.metric_type == 'pearson_spearman_corr':
+    elif self.metric_type == 'pearson_spearman_corr':
      preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
+      preds = np.reshape(preds, -1)
      labels = np.concatenate(aggregated_logs['labels'], axis=0)
+      labels = np.reshape(labels, -1)
      pearson_corr = stats.pearsonr(preds, labels)[0]
      spearman_corr = stats.spearmanr(preds, labels)[0]
      corr_metric = (pearson_corr + spearman_corr) / 2
@@ -189,5 +224,54 @@ class SentencePredictionTask(base_task.Task):
    ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping)
    status = ckpt.read(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
-    logging.info('finished loading pretrained checkpoint from %s',
+    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
+
+
+def predict(task: SentencePredictionTask, params: cfg.DataConfig,
+            model: tf.keras.Model) -> List[Union[int, float]]:
+  """Runs `model` over the data described by `params` and gathers predictions.
+
+  Args:
+    task: The `SentencePredictionTask` providing `build_inputs` and
+      `inference_step`.
+    params: The `cfg.DataConfig` handed to `task.build_inputs`.
+    model: The keras.Model to run.
+
+  Returns:
+    A list of predictions with length of `num_examples`. When the task is
+      configured with `num_classes == 1` (regression) each element is the
+      model output itself; otherwise each element is the argmax over the last
+      axis of the model output, i.e. the predicted class id.
+  """
+  regression_mode = task.task_config.model.num_classes == 1
+
+  @tf.function
+  def predict_step(iterator):
+    """Runs one prediction step on every replica."""
+
+    def _per_replica_fn(batch):
+      """Computes the predictions for one replica's batch."""
+      features, _ = batch
+      model_outputs = task.inference_step(features, model)
+      if regression_mode:
+        return model_outputs
+      return tf.argmax(model_outputs, axis=-1)
+
+    replica_results = tf.distribute.get_strategy().run(
+        _per_replica_fn, args=(next(iterator),))
+    return tf.nest.map_structure(
+        tf.distribute.get_strategy().experimental_local_results,
+        replica_results)
+
+  def reduce_fn(state, outputs):
+    """Appends the predictions of every replica to the running list."""
+    for replica_predictions in outputs:
+      state.extend(replica_predictions)
+    return state
+
+  loop_fn = orbit.utils.create_loop_fn(predict_step)
+  dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
+                                                 task.build_inputs, params)
+  # `num_steps=-1` keeps looping until the dataset is exhausted.
+  return loop_fn(iter(dataset), num_steps=-1, state=[], reduce_fn=reduce_fn)
--- a/official/nlp/tasks/sentence_prediction_test.py
+++ b/official/nlp/tasks/sentence_prediction_test.py
@@ -18,6 +18,7 @@ import functools
 import os

 from absl.testing import parameterized
+import numpy as np
 import tensorflow as tf

 from official.nlp.bert import configs
@@ -28,6 +29,35 @@ from official.nlp.data import sentence_prediction_dataloader
 from official.nlp.tasks import sentence_prediction


+def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
+  """Writes `num_examples` random `tf.train.Example`s to a TFRecord file.
+
+  Args:
+    output_path: Path of the TFRecord file to write.
+    seq_length: Length of the `input_ids`, `input_mask` and `segment_ids`
+      features of every example.
+    num_classes: When 1, `label_ids` is a single random float (regression);
+      otherwise it is a single random int in `[0, num_classes - 1]`.
+    num_examples: Number of examples to write.
+  """
+
+  def create_int_feature(values):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+
+  def create_float_feature(values):
+    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+
+  # The context manager closes the writer even if building an example raises.
+  with tf.io.TFRecordWriter(output_path) as writer:
+    for _ in range(num_examples):
+      features = {}
+      input_ids = np.random.randint(100, size=(seq_length))
+      features["input_ids"] = create_int_feature(input_ids)
+      features["input_mask"] = create_int_feature(np.ones_like(input_ids))
+      features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
+
+      if num_classes == 1:
+        features["label_ids"] = create_float_feature([np.random.random()])
+      else:
+        # `randint` excludes its upper bound, so this draws from
+        # [0, num_classes - 1] just like the deprecated `random_integers`.
+        features["label_ids"] = create_int_feature(
+            [np.random.randint(num_classes)])
+
+      tf_example = tf.train.Example(
+          features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+
+
 class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
@@ -37,16 +67,10 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
            input_path="dummy", seq_length=128, global_batch_size=1))

  def get_model_config(self, num_classes):
+    """Builds a `ModelConfig` with a one-layer encoder and `num_classes`."""
-    return bert.BertPretrainerConfig(
+    return sentence_prediction.ModelConfig(
        encoder=encoders.TransformerEncoderConfig(
            vocab_size=30522, num_layers=1),
-        num_masked_tokens=0,
-        cls_heads=[
-            bert.ClsHeadConfig(
-                inner_dim=10,
-                num_classes=num_classes,
-                name="sentence_prediction")
-        ])
+        num_classes=num_classes)

  def _run_task(self, config):
    task = sentence_prediction.SentencePredictionTask(config)
@@ -81,17 +105,52 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
    pretrain_cfg = bert.BertPretrainerConfig(
        encoder=encoders.TransformerEncoderConfig(
            vocab_size=30522, num_layers=1),
-        num_masked_tokens=20,
        cls_heads=[
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=3, name="next_sentence")
        ])
-    pretrain_model = bert.instantiate_bertpretrainer_from_cfg(pretrain_cfg)
+    pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
    ckpt = tf.train.Checkpoint(
        model=pretrain_model, **pretrain_model.checkpoint_items)
    ckpt.save(config.init_checkpoint)
    task.initialize(model)

+  @parameterized.named_parameters(
+      {
+          "testcase_name": "regression",
+          "num_classes": 1,
+      },
+      {
+          "testcase_name": "classification",
+          "num_classes": 2,
+      },
+  )
+  def test_metrics_and_losses(self, num_classes):
+    """Checks the built metric type and the validation loss per task type.
+
+    Args:
+      num_classes: Number of output classes of the model under test; 1 is
+        exercised as the regression case, 2 as the classification case.
+    """
+    config = sentence_prediction.SentencePredictionConfig(
+        init_checkpoint=self.get_temp_dir(),
+        model=self.get_model_config(num_classes),
+        train_data=self._train_data_config)
+    task = sentence_prediction.SentencePredictionTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    # The first metric is expected to depend on the task type.
+    if num_classes == 1:
+      self.assertIsInstance(metrics[0], tf.keras.metrics.MeanSquaredError)
+    else:
+      self.assertIsInstance(
+          metrics[0], tf.keras.metrics.SparseCategoricalAccuracy)
+
+    dataset = task.build_inputs(config.train_data)
+    iterator = iter(dataset)
+    # `learning_rate` is the supported argument name; `lr` is only a
+    # deprecated alias.
+    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+
+    logs = task.validation_step(next(iterator), model, metrics=metrics)
+    loss = logs["loss"].numpy()
+    # NOTE(review): the expected losses below are hard-coded; confirm they are
+    # stable across weight initializations.
+    if num_classes == 1:
+      self.assertAlmostEqual(loss, 42.77483, places=3)
+    else:
+      self.assertAlmostEqual(loss, 3.57627e-6, places=3)
+
  @parameterized.parameters(("matthews_corrcoef", 2),
                            ("pearson_spearman_corr", 1))
  def test_np_metrics(self, metric_type, num_classes):
@@ -160,6 +219,35 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
        train_data=self._train_data_config)
    self._run_task(config)

+  @parameterized.named_parameters(("classification", 5), ("regression", 1))
+  def test_prediction(self, num_classes):
+    """Checks that `predict` yields one prediction per written example."""
+    num_examples = 100
+    seq_length = 16
+    record_path = os.path.join(self.get_temp_dir(), "test.tf_record")
+    _create_fake_dataset(
+        record_path,
+        seq_length=seq_length,
+        num_classes=num_classes,
+        num_examples=num_examples)
+
+    task = sentence_prediction.SentencePredictionTask(
+        sentence_prediction.SentencePredictionConfig(
+            model=self.get_model_config(num_classes=num_classes),
+            train_data=self._train_data_config))
+    model = task.build_model()
+
+    # Integer labels for classification, float labels for regression.
+    label_type = "int" if num_classes > 1 else "float"
+    eval_data_config = (
+        sentence_prediction_dataloader.SentencePredictionDataConfig(
+            input_path=record_path,
+            seq_length=seq_length,
+            is_training=False,
+            label_type=label_type,
+            global_batch_size=16,
+            drop_remainder=False))
+
+    self.assertLen(
+        sentence_prediction.predict(task, eval_data_config, model),
+        num_examples)
+

 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/tasks/tagging.py
+++ b/official/nlp/tasks/tagging.py
@@ -15,9 +15,10 @@
 # ==============================================================================
 """Tagging (e.g., NER/POS) task."""
 import logging
-from typing import List, Optional
+from typing import List, Optional, Tuple

 import dataclasses
+import orbit

 from seqeval import metrics as seqeval_metrics

@@ -25,6 +26,7 @@ import tensorflow as tf
 import tensorflow_hub as hub

 from official.core import base_task
+from official.modeling.hyperparams import base_config
 from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import encoders
 from official.nlp.data import data_loader_factory
@@ -32,14 +34,22 @@ from official.nlp.modeling import models
 from official.nlp.tasks import utils


+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """Model configuration for the tagging task.
+
+  Attributes:
+    encoder: Configuration of the encoder network the tagging model is built
+      on.
+    head_dropout: Dropout rate passed to the token classification head.
+    head_initializer_range: Standard deviation of the truncated normal
+      initializer used for the token classification head.
+  """
+  # `default_factory` gives every `ModelConfig` its own encoder config instead
+  # of sharing one mutable default instance across all of them; dataclasses on
+  # newer Python versions also reject unhashable instances as plain defaults.
+  encoder: encoders.TransformerEncoderConfig = dataclasses.field(
+      default_factory=encoders.TransformerEncoderConfig)
+  head_dropout: float = 0.1
+  head_initializer_range: float = 0.02
+
+
 @dataclasses.dataclass
 class TaggingConfig(cfg.TaskConfig):
  """The model config."""
  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
  init_checkpoint: str = ''
  hub_module_url: str = ''
-  model: encoders.TransformerEncoderConfig = (
-      encoders.TransformerEncoderConfig())
+  model: ModelConfig = ModelConfig()

  # The real class names, the order of which should match real label id.
  # Note that a word may be tokenized into multiple word_pieces tokens, and
@@ -93,14 +103,14 @@ class TaggingTask(base_task.Task):
      encoder_network = utils.get_encoder_from_hub(self._hub_module)
    else:
      encoder_network = encoders.instantiate_encoder_from_cfg(
-          self.task_config.model)
+          self.task_config.model.encoder)

    return models.BertTokenClassifier(
        network=encoder_network,
        num_classes=len(self.task_config.class_names),
        initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=self.task_config.model.initializer_range),
-        dropout_rate=self.task_config.model.dropout_rate,
+            stddev=self.task_config.model.head_initializer_range),
+        dropout_rate=self.task_config.model.head_dropout,
        output='logits')

  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
@@ -113,7 +123,7 @@ class TaggingTask(base_task.Task):
    loss = tf.math.divide_no_nan(numerator_loss, denominator_loss)
    return loss

-  def build_inputs(self, params, input_context=None):
+  def build_inputs(self, params: cfg.DataConfig, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
    if params.input_path == 'dummy':

@@ -140,6 +150,11 @@ class TaggingTask(base_task.Task):

    return data_loader_factory.get_data_loader(params).load(input_context)

+  def inference_step(self, inputs, model: tf.keras.Model):
+    """Runs `model` in inference mode and derives the predicted label ids.
+
+    Args:
+      inputs: The features passed to `model`.
+      model: The keras.Model to call with `training=False`.
+
+    Returns:
+      A dict with the model output under `logits` and its argmax over the last
+        axis under `predict_ids`.
+    """
+    token_logits = model(inputs, training=False)
+    predicted_ids = tf.argmax(token_logits, axis=-1)
+    return {'logits': token_logits, 'predict_ids': predicted_ids}
+
  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
    """Validatation step.

@@ -153,12 +168,11 @@ class TaggingTask(base_task.Task):
    """
    features, labels = inputs
    outputs = self.inference_step(features, model)
-    loss = self.build_losses(labels=labels, model_outputs=outputs)
+    loss = self.build_losses(labels=labels, model_outputs=outputs['logits'])

    # Negative label ids are padding labels which should be ignored.
    real_label_index = tf.where(tf.greater_equal(labels, 0))
-    predict_ids = tf.math.argmax(outputs, axis=-1)
-    predict_ids = tf.gather_nd(predict_ids, real_label_index)
+    predict_ids = tf.gather_nd(outputs['predict_ids'], real_label_index)
    label_ids = tf.gather_nd(labels, real_label_index)
    return {
        self.loss: loss,
@@ -212,5 +226,69 @@ class TaggingTask(base_task.Task):
    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
    status = ckpt.restore(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
-    logging.info('finished loading pretrained checkpoint from %s',
+    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
+
+
+def predict(task: TaggingTask, params: cfg.DataConfig,
+            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
+  """Runs `model` over the data described by `params` and gathers label ids.
+
+  Args:
+    task: The `TaggingTask` providing `build_inputs` and `inference_step`.
+    params: The `cfg.DataConfig` handed to `task.build_inputs`.
+    model: The keras.Model to run.
+
+  Returns:
+    A tuple of `predict_ids` and `sentence_ids`, both lists with one entry per
+      example. Each element of `predict_ids` is the sequence of predicted
+      label ids at the positions whose label id is non-negative (padding
+      positions are dropped), and each element of `sentence_ids` is the
+      `sentence_id` feature of the corresponding example.
+  """
+
+  @tf.function
+  def predict_step(iterator):
+    """Runs one prediction step on every replica."""
+
+    def _per_replica_fn(batch):
+      """Computes predictions, padding mask and sentence ids for a batch."""
+      features, labels = batch
+      # `sentence_id` is removed from the features before the model sees them.
+      sentence_ids = features.pop('sentence_id')
+      predict_ids = task.inference_step(features, model)['predict_ids']
+      return {
+          'predict_ids': predict_ids,
+          # Negative label ids mark padding positions.
+          'label_mask': tf.greater_equal(labels, 0),
+          'sentence_ids': sentence_ids,
+      }
+
+    replica_results = tf.distribute.get_strategy().run(
+        _per_replica_fn, args=(next(iterator),))
+    return tf.nest.map_structure(
+        tf.distribute.get_strategy().experimental_local_results,
+        replica_results)
+
+  def reduce_fn(state, outputs):
+    """Appends the un-padded predictions of every replica to `state`."""
+    all_predict_ids, all_sentence_ids = state
+    per_replica = zip(outputs['predict_ids'], outputs['label_mask'],
+                      outputs['sentence_ids'])
+    for replica_ids, replica_mask, replica_sentence_ids in per_replica:
+      per_example = zip(replica_ids.numpy(), replica_mask.numpy(),
+                        replica_sentence_ids.numpy())
+      for example_ids, example_mask, sentence_id in per_example:
+        all_sentence_ids.append(sentence_id)
+        kept_ids = []
+        all_predict_ids.append(kept_ids)
+        assert len(example_ids) == len(example_mask)
+        # Keep only the positions that carry a real (non-padding) label.
+        kept_ids.extend(
+            label_id for label_id, is_real in zip(example_ids, example_mask)
+            if is_real)
+    return all_predict_ids, all_sentence_ids
+
+  loop_fn = orbit.utils.create_loop_fn(predict_step)
+  dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
+                                                 task.build_inputs, params)
+  # `num_steps=-1` keeps looping until the dataset is exhausted.
+  predict_ids, sentence_ids = loop_fn(
+      iter(dataset), num_steps=-1, state=([], []), reduce_fn=reduce_fn)
+  return predict_ids, sentence_ids
--- a/official/nlp/tasks/tagging_test.py
+++ b/official/nlp/tasks/tagging_test.py
@@ -16,6 +16,7 @@
 """Tests for official.nlp.tasks.tagging."""
 import functools
 import os
+import numpy as np
 import tensorflow as tf

 from official.nlp.bert import configs
@@ -25,6 +26,29 @@ from official.nlp.data import tagging_data_loader
 from official.nlp.tasks import tagging


+def _create_fake_dataset(output_path, seq_length, num_labels, num_examples):
+  """Writes `num_examples` random tagging examples to a TFRecord file.
+
+  Args:
+    output_path: Path of the TFRecord file to write.
+    seq_length: Length of the per-token features of every example.
+    num_labels: Number of label classes; `label_ids` are drawn from
+      `[-1, num_labels - 1]`.
+    num_examples: Number of examples to write; example `i` gets
+      `sentence_id` `i`.
+  """
+
+  def create_int_feature(values):
+    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+    return f
+
+  # The context manager closes the writer even if building an example raises.
+  with tf.io.TFRecordWriter(output_path) as writer:
+    for i in range(num_examples):
+      features = {}
+      input_ids = np.random.randint(100, size=(seq_length))
+      features["input_ids"] = create_int_feature(input_ids)
+      features["input_mask"] = create_int_feature(np.ones_like(input_ids))
+      features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
+      # `randint` excludes its upper bound, so this draws from
+      # [-1, num_labels - 1] just like the deprecated `random_integers`.
+      features["label_ids"] = create_int_feature(
+          np.random.randint(-1, num_labels, size=(seq_length)))
+      features["sentence_id"] = create_int_feature([i])
+
+      tf_example = tf.train.Example(
+          features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+
+
 class TaggingTest(tf.test.TestCase):

  def setUp(self):
@@ -56,7 +80,7 @@ class TaggingTest(tf.test.TestCase):

    config = tagging.TaggingConfig(
        init_checkpoint=saved_path,
-        model=self._encoder_config,
+        model=tagging.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
        class_names=["O", "B-PER", "I-PER"])
    task = tagging.TaggingTask(config)
@@ -72,7 +96,7 @@ class TaggingTest(tf.test.TestCase):

  def test_task_with_fit(self):
    config = tagging.TaggingConfig(
-        model=self._encoder_config,
+        model=tagging.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
        class_names=["O", "B-PER", "I-PER"])

@@ -115,14 +139,13 @@ class TaggingTest(tf.test.TestCase):
    hub_module_url = self._export_bert_tfhub()
    config = tagging.TaggingConfig(
        hub_module_url=hub_module_url,
-        model=self._encoder_config,
        class_names=["O", "B-PER", "I-PER"],
        train_data=self._train_data_config)
    self._run_task(config)

  def test_seqeval_metrics(self):
    config = tagging.TaggingConfig(
-        model=self._encoder_config,
+        model=tagging.ModelConfig(encoder=self._encoder_config),
        train_data=self._train_data_config,
        class_names=["O", "B-PER", "I-PER"])
    task = tagging.TaggingTask(config)
@@ -141,6 +164,34 @@ class TaggingTest(tf.test.TestCase):
    self.assertCountEqual({"f1", "precision", "recall", "accuracy"},
                          task.reduce_aggregated_logs(aggregated).keys())

+  def test_predict(self):
+    """Checks that `predict` returns one entry per written example."""
+    num_examples = 100
+    seq_length = 16
+    class_names = ["O", "B-PER", "I-PER"]
+
+    task = tagging.TaggingTask(
+        tagging.TaggingConfig(
+            model=tagging.ModelConfig(encoder=self._encoder_config),
+            train_data=self._train_data_config,
+            class_names=class_names))
+    model = task.build_model()
+
+    record_path = os.path.join(self.get_temp_dir(), "test.tf_record")
+    _create_fake_dataset(
+        record_path,
+        seq_length=seq_length,
+        num_labels=len(class_names),
+        num_examples=num_examples)
+    eval_data_config = tagging_data_loader.TaggingDataConfig(
+        input_path=record_path,
+        seq_length=seq_length,
+        is_training=False,
+        global_batch_size=16,
+        drop_remainder=False,
+        include_sentence_id=True)
+
+    results = tagging.predict(task, eval_data_config, model)
+    predict_ids, sentence_ids = results
+    self.assertLen(predict_ids, num_examples)
+    self.assertLen(sentence_ids, num_examples)
+

 if __name__ == "__main__":
  tf.test.main()