Merge branch 'master' into move_to_keraslayers_fasterrcnn_fpn_keras_feature_extractor

0cceabfc · Yiming Shi · GitHub · 17821c0d · 39ee0ac9 · 0cceabfc
Unverified Commit 0cceabfc authored Aug 03, 2020 by Yiming Shi Committed by GitHub Aug 03, 2020
20 changed files
--- a/official/nlp/modeling/ops/__init__.py
+++ b/official/nlp/modeling/ops/__init__.py
--- a/official/nlp/modeling/ops/beam_search.py
+++ b/official/nlp/modeling/ops/beam_search.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Beam search to find the translated sequence with the highest probability."""
+import numpy as np
+import tensorflow as tf
+def inf(dtype):
+  """Returns a value close to infinity, but is still finite in `dtype`.
+  This is useful to get a very large value that is still zero when multiplied by
+  zero. The floating-point "Inf" value is NaN when multiplied by zero.
+  Args:
+    dtype: A dtype. The returned value will be finite when cast to this dtype.
+  Returns:
+    A very large value.
+  """
+  if dtype == "float32" or dtype == "bfloat16":
+    return 1e7
+  elif dtype == "float16":
+    # Disable no-member lint error, as the linter thinks np.float16 does not
+    # exist for some reason.
+    return np.finfo(np.float16).max  # pylint: disable=no-member
+  else:
+    raise AssertionError("Invalid dtype: %s" % dtype)
+class _StateKeys(object):
+  """Keys to dictionary storing the state of the beam search loop."""
+  # Variable storing the loop index.
+  CUR_INDEX = "CUR_INDEX"
+  # Top sequences that are alive for each batch item. Alive sequences are ones
+  # that have not generated an EOS token. Sequences that reach EOS are marked as
+  # finished and moved to the FINISHED_SEQ tensor.
+  # Has shape [batch_size, beam_size, CUR_INDEX + 1]
+  ALIVE_SEQ = "ALIVE_SEQ"
+  # Log probabilities of each alive sequence. Shape [batch_size, beam_size]
+  ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS"
+  # Dictionary of cached values for each alive sequence. The cache stores
+  # the encoder output, attention bias, and the decoder attention output from
+  # the previous iteration.
+  ALIVE_CACHE = "ALIVE_CACHE"
+  # Top finished sequences for each batch item.
+  # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are
+  # shorter than CUR_INDEX + 1 are padded with 0s.
+  FINISHED_SEQ = "FINISHED_SEQ"
+  # Scores for each finished sequence. Score = log probability / length norm
+  # Shape [batch_size, beam_size]
+  FINISHED_SCORES = "FINISHED_SCORES"
+  # Flags indicating which sequences in the finished sequences are finished.
+  # At the beginning, all of the sequences in FINISHED_SEQ are filler values.
+  # True -> finished sequence, False -> filler. Shape [batch_size, beam_size]
+  FINISHED_FLAGS = "FINISHED_FLAGS"
+def _expand_to_same_rank(tensor, target):
+  """Expands a given tensor to target's rank to be broadcastable.
+  Args:
+    tensor: input tensor to expand. Shape: [b, d1, ..., da]
+    target: target tensor. Shape: [b, d1, ..., da, ..., dn]
+  Returns:
+    Expanded tensor of shape [b, d1, ..., da, 1, ..., 1] with target's rank.
+  Raises:
+    ValueError: if the rank of tensor or target is None.
+  """
+  if tensor.shape.rank is None:
+    raise ValueError("Expect rank for tensor shape, but got None.")
+  if target.shape.rank is None:
+    raise ValueError("Expect rank for target shape, but got None.")
+  with tf.name_scope("expand_rank"):
+    diff_rank = target.shape.rank - tensor.shape.rank
+    for _ in range(diff_rank):
+      tensor = tf.expand_dims(tensor, -1)
+    return tensor
+class SequenceBeamSearch(tf.Module):
+  """Implementation of beam search loop."""
+  def __init__(self,
+               symbols_to_logits_fn,
+               vocab_size,
+               beam_size,
+               alpha,
+               max_decode_length,
+               eos_id,
+               padded_decode,
+               dtype=tf.float32):
+    """Initialize sequence beam search.
+    Args:
+      symbols_to_logits_fn: A function to provide logits, which is the
+        interface to the Transformer model. The passed in arguments are: ids ->
+          A tensor with shape [batch_size * beam_size, index]. index -> A
+          scalar. cache -> A nested dictionary of tensors [batch_size *
+          beam_size, ...].
+        The function must return a tuple of logits and the updated cache: logits
+          -> A tensor with shape [batch * beam_size, vocab_size]. updated cache
+          -> A nested dictionary with the same structure as the input cache.
+      vocab_size: An integer, the size of the vocabulary, used for topk
+        computation.
+      beam_size: An integer, number of beams for beam search.
+      alpha: A float, defining the strength of length normalization.
+      max_decode_length: An integer, the maximum number of steps to decode a
+        sequence.
+      eos_id: An integer. ID of end of sentence token.
+      padded_decode: A bool, indicating if max_sequence_length padding is used
+        for beam search.
+      dtype: A tensorflow data type used for score computation. The default is
+        tf.float32.
+    """
+    self.symbols_to_logits_fn = symbols_to_logits_fn
+    self.vocab_size = vocab_size
+    self.beam_size = beam_size
+    self.alpha = alpha
+    self.max_decode_length = max_decode_length
+    self.eos_id = eos_id
+    self.padded_decode = padded_decode
+    self.dtype = tf.as_dtype(dtype)
+  def search(self, initial_ids, initial_cache):
+    """Beam search for sequences with highest scores.
+    Args:
+      initial_ids: initial ids to pass into the symbols_to_logits_fn. int tensor
+        with shape [batch_size]
+      initial_cache: dictionary storing values to be passed into the
+        symbols_to_logits_fn.
+    Returns:
+      finished_seq and finished_scores.
+    """
+    batch_size = (
+        initial_ids.shape.as_list()[0]
+        if self.padded_decode else tf.shape(initial_ids)[0])
+    state, state_shapes = self._create_initial_state(initial_ids, initial_cache,
+                                                     batch_size)
+    def _grow_alive_seq(state):
+      """Grow alive sequences by one token, collect top 2*beam_size sequences.
+      2*beam_size sequences are collected because some sequences may have
+      reached the EOS token. 2*beam_size ensures that at least beam_size
+      sequences are still alive.
+      Args:
+        state: A dictionary with the current loop state.
+      Returns:
+        Tuple of
+        (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
+         Scores of returned sequences [batch_size, 2 * beam_size],
+         New alive cache, for each of the 2 * beam_size sequences)
+      """
+      i = state[_StateKeys.CUR_INDEX]
+      alive_seq = state[_StateKeys.ALIVE_SEQ]
+      alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
+      alive_cache = state[_StateKeys.ALIVE_CACHE]
+      beams_to_keep = 2 * self.beam_size
+      # Get logits for the next candidate IDs for the alive sequences. Get the
+      # new cache values at the same time.
+      if self.padded_decode:
+        flat_ids = tf.reshape(
+            tf.slice(alive_seq, [0, 0, i], [batch_size, self.beam_size, 1]),
+            [batch_size * self.beam_size, -1])
+      else:
+        flat_ids = _flatten_beam_dim(alive_seq)  # [batch_size * beam_size]
+      flat_cache = tf.nest.map_structure(_flatten_beam_dim, alive_cache)
+      flat_logits, flat_cache = self.symbols_to_logits_fn(
+          flat_ids, i, flat_cache)
+      # Unflatten logits to shape [batch_size, beam_size, vocab_size]
+      logits = _unflatten_beam_dim(flat_logits, batch_size, self.beam_size)
+      new_cache = tf.nest.map_structure(
+          lambda t: _unflatten_beam_dim(t, batch_size, self.beam_size),
+          flat_cache)
+      # Convert logits to normalized log probs
+      candidate_log_probs = _log_prob_from_logits(logits)
+      # Calculate new log probabilities if each of the alive sequences were
+      # extended by the candidate IDs.
+      # Shape [batch_size, beam_size, vocab_size]
+      log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
+      # Each batch item has beam_size * vocab_size candidate sequences. For each
+      # batch item, get the k candidates with the highest log probabilities.
+      flat_log_probs = tf.reshape(log_probs,
+                                  [-1, self.beam_size * self.vocab_size])
+      topk_log_probs, topk_indices = tf.nn.top_k(
+          flat_log_probs, k=beams_to_keep)
+      # Extract the alive sequences that generate the highest log probabilities
+      # after being extended.
+      topk_beam_indices = topk_indices // self.vocab_size
+      topk_seq, new_cache = _gather_beams([alive_seq, new_cache],
+                                          topk_beam_indices, batch_size,
+                                          beams_to_keep)
+      # Append the most probable IDs to the topk sequences
+      topk_ids = topk_indices % self.vocab_size
+      if self.padded_decode:
+        topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1])
+        # TODO(b/145533236, hongkuny): Reverts once TF fix the validation.
+        topk_seq = tf.tensor_scatter_nd_update(topk_seq, [[i + 1]],
+                                               tf.expand_dims(topk_ids, axis=0))
+        topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0])
+      else:
+        topk_seq = tf.concat(
+            [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
+      return topk_seq, topk_log_probs, topk_ids, new_cache
+    def _get_new_alive_state(new_seq, new_log_probs, new_finished_flags,
+                             new_cache):
+      """Gather the top k sequences that are still alive.
+      Args:
+        new_seq: New sequences generated by growing the current alive sequences
+          int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1]
+        new_log_probs: Log probabilities of new sequences float32 tensor with
+          shape [batch_size, 2 * beam_size]
+        new_finished_flags: A boolean Tensor indicating which of the new
+          sequences have finished, i.e. whose latest token is eos_id.
+        new_cache: Dict of cached values for each sequence.
+      Returns:
+        Dictionary with alive keys from _StateKeys:
+          {Top beam_size sequences that are still alive (don't end with eos_id)
+           Log probabilities of top alive sequences
+           Dict cache storing decoder states for top alive sequences}
+      """
+      # To prevent finished sequences from being considered, set log probs to
+      # -inf.
+      new_log_probs += tf.cast(new_finished_flags,
+                               self.dtype) * -inf(self.dtype)
+      top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams(
+          [new_seq, new_log_probs, new_cache], new_log_probs, batch_size,
+          self.beam_size)
+      return {
+          _StateKeys.ALIVE_SEQ: top_alive_seq,
+          _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs,
+          _StateKeys.ALIVE_CACHE: top_alive_cache
+      }
+    def _get_new_finished_state(state, new_seq, new_log_probs,
+                                new_finished_flags):
+      """Combine new and old finished sequences, and gather the top k sequences.
+      Args:
+        state: A dictionary with the current loop state.
+        new_seq: New sequences generated by growing the current alive sequences
+          int32 tensor with shape [batch_size, 2 * beam_size, i + 1]
+        new_log_probs: Log probabilities of new sequences float32 tensor with
+          shape [batch_size, 2 * beam_size]
+        new_finished_flags: A boolean Tensor indicating which of the new
+          sequences have finished, i.e. whose latest token is eos_id.
+      Returns:
+        Dictionary with finished keys from _StateKeys:
+          {Top beam_size finished sequences based on score,
+           Scores of finished sequences,
+           Finished flags of finished sequences}
+      """
+      i = state[_StateKeys.CUR_INDEX]
+      finished_seq = state[_StateKeys.FINISHED_SEQ]
+      finished_scores = state[_StateKeys.FINISHED_SCORES]
+      finished_flags = state[_StateKeys.FINISHED_FLAGS]
+      # First append a column of 0-ids to finished_seq to increment the length.
+      # New shape of finished_seq: [batch_size, beam_size, i + 1]
+      if not self.padded_decode:
+        finished_seq = tf.concat(
+            [finished_seq,
+             tf.zeros([batch_size, self.beam_size, 1], tf.int32)],
+            axis=2)
+      # Calculate new seq scores from log probabilities.
+      length_norm = _length_normalization(self.alpha, i + 1, dtype=self.dtype)
+      new_scores = new_log_probs / length_norm
+      # Set the scores of the still-alive seq in new_seq to large negative
+      # values.
+      new_scores += ((1. - tf.cast(new_finished_flags, self.dtype)) *
+                     -inf(self.dtype))
+      # Combine sequences, scores, and flags.
+      finished_seq = tf.concat([finished_seq, new_seq], axis=1)
+      finished_scores = tf.concat([finished_scores, new_scores], axis=1)
+      finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1)
+      # Return the finished sequences with the best scores.
+      top_finished_seq, top_finished_scores, top_finished_flags = (
+          _gather_topk_beams([finished_seq, finished_scores, finished_flags],
+                             finished_scores, batch_size, self.beam_size))
+      return {
+          _StateKeys.FINISHED_SEQ: top_finished_seq,
+          _StateKeys.FINISHED_SCORES: top_finished_scores,
+          _StateKeys.FINISHED_FLAGS: top_finished_flags
+      }
+    def _search_step(state):
+      """Beam search loop body.
+      Grow alive sequences by a single ID. Sequences that have reached the EOS
+      token are marked as finished. The alive and finished sequences with the
+      highest log probabilities and scores are returned.
+      A sequence's finished score is calculated by dividing the log probability
+      by the length normalization factor. Without length normalization, the
+      search is more likely to return shorter sequences.
+      Args:
+        state: A dictionary with the current loop state.
+      Returns:
+        new state dictionary.
+      """
+      # Grow alive sequences by one token.
+      new_seq, new_log_probs, topk_ids, new_cache = _grow_alive_seq(state)
+      new_finished_flags = tf.equal(topk_ids, self.eos_id)
+      # Collect top beam_size alive sequences
+      alive_state = _get_new_alive_state(new_seq, new_log_probs,
+                                         new_finished_flags, new_cache)
+      # Combine newly finished sequences with existing finished sequences, and
+      # collect the top k scoring sequences.
+      finished_state = _get_new_finished_state(state, new_seq, new_log_probs,
+                                               new_finished_flags)
+      # Increment loop index and create new state dictionary
+      new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1}
+      new_state.update(alive_state)
+      new_state.update(finished_state)
+      return [new_state]
+    finished_state = tf.nest.map_structure(
+        tf.stop_gradient,
+        tf.while_loop(
+            self._continue_search,
+            _search_step,
+            loop_vars=[state],
+            shape_invariants=[state_shapes],
+            parallel_iterations=1))
+    finished_state = finished_state[0]
+    return self._process_finished_state(finished_state)
+  def _process_finished_state(self, finished_state):
+    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
+    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
+    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
+    finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
+    finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]
+    # TF2 changes tf.where behavior. Should make parameters broadcastable.
+    finished_cond = tf.reduce_any(finished_flags, 1, name="finished_cond")
+    seq_cond = _expand_to_same_rank(finished_cond, finished_seq)
+    score_cond = _expand_to_same_rank(finished_cond, finished_scores)
+    # Account for corner case where there are no finished sequences for a
+    # particular batch item. In that case, return alive sequences for that batch
+    # item.
+    finished_seq = tf.where(seq_cond, finished_seq, alive_seq)
+    finished_scores = tf.where(score_cond, finished_scores, alive_log_probs)
+    return finished_seq, finished_scores
+  def _create_initial_state(self, initial_ids, initial_cache, batch_size):
+    """Return initial state dictionary and its shape invariants."""
+    for key, value in initial_cache.items():
+      for inner_value in tf.nest.flatten(value):
+        if inner_value.dtype != self.dtype:
+          raise TypeError(
+              "initial_cache element for key '%s' has dtype %s that does not "
+              "match SequenceBeamSearch's dtype of %s. Value: %s" %
+              (key, value.dtype.name, self.dtype.name, inner_value))
+    # Current loop index (starts at 0)
+    cur_index = tf.constant(0)
+    # Create alive sequence with shape [batch_size, beam_size, 1]
+    alive_seq = _expand_to_beam_size(initial_ids, self.beam_size)
+    alive_seq = tf.expand_dims(alive_seq, axis=2)
+    if self.padded_decode:
+      alive_seq = tf.tile(alive_seq, [1, 1, self.max_decode_length + 1])
+    # Create tensor for storing initial log probabilities.
+    # Assume initial_ids are prob 1.0
+    initial_log_probs = tf.constant([[0.] + [-float("inf")] *
+                                     (self.beam_size - 1)],
+                                    dtype=self.dtype)
+    alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
+    # Expand all values stored in the dictionary to the beam size, so that each
+    # beam has a separate cache.
+    alive_cache = tf.nest.map_structure(
+        lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache)
+    # Initialize tensor storing finished sequences with filler values.
+    finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
+    # Set scores of the initial finished seqs to negative infinity.
+    finished_scores = tf.ones([batch_size, self.beam_size],
+                              dtype=self.dtype) * -inf(self.dtype)
+    # Initialize finished flags with all False values.
+    finished_flags = tf.zeros([batch_size, self.beam_size], tf.bool)
+    # Create state dictionary
+    state = {
+        _StateKeys.CUR_INDEX: cur_index,
+        _StateKeys.ALIVE_SEQ: alive_seq,
+        _StateKeys.ALIVE_LOG_PROBS: alive_log_probs,
+        _StateKeys.ALIVE_CACHE: alive_cache,
+        _StateKeys.FINISHED_SEQ: finished_seq,
+        _StateKeys.FINISHED_SCORES: finished_scores,
+        _StateKeys.FINISHED_FLAGS: finished_flags
+    }
+    # Create state invariants for each value in the state dictionary. Each
+    # dimension must be a constant or None. A None dimension means either:
+    #   1) the dimension's value is a tensor that remains the same but may
+    #      depend on the input sequence to the model (e.g. batch size).
+    #   2) the dimension may have different values on different iterations.
+    if self.padded_decode:
+      state_shape_invariants = {
+          _StateKeys.CUR_INDEX:
+              tf.TensorShape([]),
+          _StateKeys.ALIVE_SEQ:
+              tf.TensorShape(
+                  [batch_size, self.beam_size, self.max_decode_length + 1]),
+          _StateKeys.ALIVE_LOG_PROBS:
+              tf.TensorShape([batch_size, self.beam_size]),
+          _StateKeys.ALIVE_CACHE:
+              tf.nest.map_structure(_get_shape, alive_cache),
+          _StateKeys.FINISHED_SEQ:
+              tf.TensorShape(
+                  [batch_size, self.beam_size, self.max_decode_length + 1]),
+          _StateKeys.FINISHED_SCORES:
+              tf.TensorShape([batch_size, self.beam_size]),
+          _StateKeys.FINISHED_FLAGS:
+              tf.TensorShape([batch_size, self.beam_size])
+      }
+    else:
+      state_shape_invariants = {
+          _StateKeys.CUR_INDEX:
+              tf.TensorShape([]),
+          _StateKeys.ALIVE_SEQ:
+              tf.TensorShape([None, self.beam_size, None]),
+          _StateKeys.ALIVE_LOG_PROBS:
+              tf.TensorShape([None, self.beam_size]),
+          _StateKeys.ALIVE_CACHE:
+              tf.nest.map_structure(_get_shape_keep_last_dim, alive_cache),
+          _StateKeys.FINISHED_SEQ:
+              tf.TensorShape([None, self.beam_size, None]),
+          _StateKeys.FINISHED_SCORES:
+              tf.TensorShape([None, self.beam_size]),
+          _StateKeys.FINISHED_FLAGS:
+              tf.TensorShape([None, self.beam_size])
+      }
+    return state, state_shape_invariants
+  def _continue_search(self, state):
+    """Return whether to continue the search loop.
+    The loop should terminate when
+      1) the maximum decode length has been reached, or
+      2) the worst score in the finished sequences is better than the best
+         score in the alive sequences (i.e. the finished sequences are provably
+         unchanging)
+    Args:
+      state: A dictionary with the current loop state.
+    Returns:
+      Bool tensor with value True if loop should continue, False if loop should
+      terminate.
+    """
+    i = state[_StateKeys.CUR_INDEX]
+    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
+    finished_scores = state[_StateKeys.FINISHED_SCORES]
+    finished_flags = state[_StateKeys.FINISHED_FLAGS]
+    not_at_max_decode_length = tf.less(i, self.max_decode_length)
+    # Calculate the largest length penalty (the larger the penalty, the better
+    # the score).
+    max_length_norm = _length_normalization(
+        self.alpha, self.max_decode_length, dtype=self.dtype)
+    # Get the best possible scores from alive sequences.
+    best_alive_scores = alive_log_probs[:, 0] / max_length_norm
+    # Compute worst score in finished sequences for each batch element
+    finished_scores *= tf.cast(finished_flags,
+                               self.dtype)  # set filler scores to zero
+    lowest_finished_scores = tf.reduce_min(finished_scores, axis=1)
+    # If there are no finished sequences in a batch element, then set the lowest
+    # finished score to -INF for that element.
+    finished_batches = tf.reduce_any(finished_flags, 1)
+    lowest_finished_scores += ((1.0 - tf.cast(finished_batches, self.dtype)) *
+                               -inf(self.dtype))
+    worst_finished_score_better_than_best_alive_score = tf.reduce_all(
+        tf.greater(lowest_finished_scores, best_alive_scores))
+    return tf.logical_and(
+        not_at_max_decode_length,
+        tf.logical_not(worst_finished_score_better_than_best_alive_score))
+def sequence_beam_search(symbols_to_logits_fn,
+                         initial_ids,
+                         initial_cache,
+                         vocab_size,
+                         beam_size,
+                         alpha,
+                         max_decode_length,
+                         eos_id,
+                         padded_decode=False,
+                         dtype="float32"):
+  """Search for sequence of subtoken ids with the largest probability.
+  Args:
+    symbols_to_logits_fn: A function that takes in ids, index, and cache as
+      arguments. The passed in arguments will have shape: ids -> A tensor with
+        shape [batch_size * beam_size, index]. index -> A scalar. cache -> A
+        nested dictionary of tensors [batch_size * beam_size, ...].
+      The function must return a tuple of logits and new cache: logits -> A
+        tensor with shape [batch * beam_size, vocab_size]. new cache -> A nested
+        dictionary with the same shape/structure as the inputted cache.
+    initial_ids: An int32 tensor with shape [batch_size]. Starting ids for each
+      batch item.
+    initial_cache: A dictionary, containing starting decoder variables
+      information.
+    vocab_size: An integer, the size of tokens.
+    beam_size: An integer, the number of beams.
+    alpha: A float, defining the strength of length normalization.
+    max_decode_length: An integer, the maximum length to decode a sequence.
+    eos_id: An integer, ID of eos token, used to determine when a sequence has
+      finished.
+    padded_decode: A bool, indicating if max_sequence_length padding is used for
+      beam search.
+    dtype: A tensorflow data type used for score computation. The default is
+      tf.float32.
+  Returns:
+    Top decoded sequences [batch_size, beam_size, max_decode_length]
+    sequence scores [batch_size, beam_size]
+  """
+  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, beam_size, alpha,
+                           max_decode_length, eos_id, padded_decode, dtype)
+  return sbs.search(initial_ids, initial_cache)
+def _log_prob_from_logits(logits):
+  return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True)
+def _length_normalization(alpha, length, dtype=tf.float32):
+  """Return length normalization factor."""
+  return tf.pow(((5. + tf.cast(length, dtype)) / 6.), alpha)
+def _expand_to_beam_size(tensor, beam_size):
+  """Tiles a given tensor by beam_size.
+  Args:
+    tensor: tensor to tile [batch_size, ...]
+    beam_size: How much to tile the tensor by.
+  Returns:
+    Tiled tensor [batch_size, beam_size, ...]
+  """
+  tensor = tf.expand_dims(tensor, axis=1)
+  tile_dims = [1] * tensor.shape.ndims
+  tile_dims[1] = beam_size
+  return tf.tile(tensor, tile_dims)
+def _shape_list(tensor):
+  """Return a list of the tensor's shape, and ensure no None values in list."""
+  # Get statically known shape (may contain None's for unknown dimensions)
+  shape = tensor.get_shape().as_list()
+  # Ensure that the shape values are not None
+  dynamic_shape = tf.shape(tensor)
+  for i in range(len(shape)):  # pylint: disable=consider-using-enumerate
+    if shape[i] is None:
+      shape[i] = dynamic_shape[i]
+  return shape
+def _get_shape_keep_last_dim(tensor):
+  shape_list = _shape_list(tensor)
+  # Only the last dimension is kept (when statically known); all leading
+  # dimensions are set to None.
+  for i in range(len(shape_list) - 1):
+    shape_list[i] = None
+  if isinstance(shape_list[-1], tf.Tensor):
+    shape_list[-1] = None
+  return tf.TensorShape(shape_list)
+def _get_shape(tensor):
+  """Return the shape of the input tensor."""
+  return tf.TensorShape(_shape_list(tensor))
+def _flatten_beam_dim(tensor):
+  """Reshapes first two dimensions into a single dimension.
+  Args:
+    tensor: Tensor to reshape of shape [A, B, ...]
+  Returns:
+    Reshaped tensor of shape [A*B, ...]
+  """
+  shape = _shape_list(tensor)
+  shape[0] *= shape[1]
+  shape.pop(1)  # Remove beam dim
+  return tf.reshape(tensor, shape)
+def _unflatten_beam_dim(tensor, batch_size, beam_size):
+  """Reshapes first dimension back to [batch_size, beam_size].
+  Args:
+    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
+    batch_size: Tensor, original batch size.
+    beam_size: int, original beam size.
+  Returns:
+    Reshaped tensor of shape [batch_size, beam_size, ...]
+  """
+  shape = _shape_list(tensor)
+  new_shape = [batch_size, beam_size] + shape[1:]
+  return tf.reshape(tensor, new_shape)
+def _gather_beams(nested, beam_indices, batch_size, new_beam_size):
+  """Gather beams from nested structure of tensors.
+  Each tensor in nested represents a batch of beams, where beam refers to a
+  single search state (beam search involves searching through multiple states
+  in parallel).
+  This function is used to gather the top beams, specified by
+  beam_indices, from the nested tensors.
+  Args:
+    nested: Nested structure (tensor, list, tuple or dict) containing tensors
+      with shape [batch_size, beam_size, ...].
+    beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
+      value in beam_indices must be between [0, beam_size), and are not
+      necessarily unique.
+    batch_size: int size of batch
+    new_beam_size: int number of beams to be pulled from the nested tensors.
+  Returns:
+    Nested structure containing tensors with shape
+      [batch_size, new_beam_size, ...]
+  """
+  # Computes the i'th coordinate that contains the batch index for gather_nd.
+  # Batch pos is a tensor like [[0,0,0,0],[1,1,1,1],..].
+  batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size
+  batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size])
+  # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor
+  # with shape [batch_size, new_beam_size, 2], where the last dimension
+  # contains the (i, j) gathering coordinates.
+  coordinates = tf.stack([batch_pos, beam_indices], axis=2)
+  return tf.nest.map_structure(lambda state: tf.gather_nd(state, coordinates),
+                               nested)
+def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size):
+  """Gather top beams from nested structure."""
+  _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size)
+  return _gather_beams(nested, topk_indexes, batch_size, beam_size)
--- a/official/nlp/transformer/beam_search_v1_test.py
+++ b/official/nlp/transformer/beam_search_v1_test.py
@@ -14,33 +14,19 @@
 # ==============================================================================
 """Test beam search helper methods."""
-import tensorflow.compat.v1 as tf
+import tensorflow as tf
-from official.nlp.transformer import beam_search_v1 as beam_search
+from official.nlp.modeling.ops import beam_search
 class BeamSearchHelperTests(tf.test.TestCase):
-  def setUp(self):
-    super(BeamSearchHelperTests, self).setUp()
-    tf.compat.v1.disable_eager_execution()
  def test_expand_to_beam_size(self):
    x = tf.ones([7, 4, 2, 5])
    x = beam_search._expand_to_beam_size(x, 3)
-    with self.session() as sess:
+    shape = tf.shape(x)
-      shape = sess.run(tf.shape(x))
    self.assertAllEqual([7, 3, 4, 2, 5], shape)
-  def test_shape_list(self):
-    y = tf.compat.v1.placeholder(dtype=tf.int32, shape=[])
-    x = tf.ones([7, y, 2, 5])
-    shape = beam_search._shape_list(x)
-    self.assertIsInstance(shape[0], int)
-    self.assertIsInstance(shape[1], tf.Tensor)
-    self.assertIsInstance(shape[2], int)
-    self.assertIsInstance(shape[3], int)
  def test_get_shape_keep_last_dim(self):
    y = tf.constant(4.0)
    x = tf.ones([7, tf.cast(tf.sqrt(y), tf.int32), 2, 5])
@@ -51,16 +37,12 @@ class BeamSearchHelperTests(tf.test.TestCase):
  def test_flatten_beam_dim(self):
    x = tf.ones([7, 4, 2, 5])
    x = beam_search._flatten_beam_dim(x)
-    with self.session() as sess:
+    self.assertAllEqual([28, 2, 5], tf.shape(x))
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([28, 2, 5], shape)
  def test_unflatten_beam_dim(self):
    x = tf.ones([28, 2, 5])
    x = beam_search._unflatten_beam_dim(x, 7, 4)
-    with self.session() as sess:
+    self.assertAllEqual([7, 4, 2, 5], tf.shape(x))
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([7, 4, 2, 5], shape)
  def test_gather_beams(self):
    x = tf.reshape(tf.range(24), [2, 3, 4])
@@ -73,9 +55,6 @@ class BeamSearchHelperTests(tf.test.TestCase):
    #                  [20 21 22 23]]]
    y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2)
-    with self.session() as sess:
-      y = sess.run(y)
    self.assertAllEqual([[[4, 5, 6, 7],
                          [8, 9, 10, 11]],
                         [[12, 13, 14, 15],
@@ -87,9 +66,6 @@ class BeamSearchHelperTests(tf.test.TestCase):
    x_scores = [[0, 1, 1], [1, 0, 1]]
    y = beam_search._gather_topk_beams(x, x_scores, 2, 2)
-    with self.session() as sess:
-      y = sess.run(y)
    self.assertAllEqual([[[4, 5, 6, 7],
                          [8, 9, 10, 11]],
                         [[12, 13, 14, 15],

--- a/official/nlp/nhnet/decoder.py
+++ b/official/nlp/nhnet/decoder.py
@@ -22,151 +22,10 @@ from __future__ import print_function
 import tensorflow as tf
 from official.modeling import tf_utils
 from official.nlp.modeling import layers
-from official.nlp.nhnet import multi_channel_attention
+from official.nlp.modeling.layers import transformer
 from official.nlp.transformer import model_utils as transformer_utils
-class TransformerDecoderBlock(tf.keras.layers.Layer):
-  """Single transformer layer for decoder.
-  It has three sub-layers:
-  (1) a multi-head self-attention mechanism.
-  (2) a encoder-decoder attention.
-  (3) a positionwise fully connected feed-forward network.
-  """
-  def __init__(self,
-               hidden_size=768,
-               num_attention_heads=12,
-               intermediate_size=3072,
-               intermediate_activation="gelu",
-               hidden_dropout_prob=0.0,
-               attention_probs_dropout_prob=0.0,
-               initializer_range=0.02,
-               multi_channel_cross_attention=False,
-               **kwargs):
-    super(TransformerDecoderBlock, self).__init__(**kwargs)
-    self.hidden_size = hidden_size
-    self.num_attention_heads = num_attention_heads
-    self.intermediate_size = intermediate_size
-    self.intermediate_activation = tf_utils.get_activation(
-        intermediate_activation)
-    self.hidden_dropout_prob = hidden_dropout_prob
-    self.attention_probs_dropout_prob = attention_probs_dropout_prob
-    self.multi_channel_cross_attention = multi_channel_cross_attention
-    self._kernel_initializer = tf.keras.initializers.TruncatedNormal(
-        stddev=initializer_range)
-    self._bias_initializer = tf.keras.initializers.get("zeros")
-    if self.multi_channel_cross_attention:
-      self._cross_attention_cls = multi_channel_attention.MultiChannelAttention
-    else:
-      self._cross_attention_cls = layers.MultiHeadAttention
-    if self.hidden_size % self.num_attention_heads != 0:
-      raise ValueError(
-          "The hidden size (%d) is not a multiple of the number of attention "
-          "heads (%d)" % (self.hidden_size, self.num_attention_heads))
-    self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
-  def build(self, input_shape):
-    # Self attention.
-    self.self_attention = layers.CachedAttention(
-        num_heads=self.num_attention_heads,
-        key_size=self.attention_head_size,
-        dropout=self.attention_probs_dropout_prob,
-        kernel_initializer=self._kernel_initializer,
-        name="self_attention")
-    self.self_attention_output_dense = layers.DenseEinsum(
-        output_shape=self.hidden_size,
-        num_summed_dimensions=2,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        name="self_attention_output")
-    self.self_attention_dropout = tf.keras.layers.Dropout(
-        rate=self.hidden_dropout_prob)
-    self.self_attention_layer_norm = (
-        tf.keras.layers.LayerNormalization(
-            name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
-    # Encoder-decoder attention.
-    self.encdec_attention = self._cross_attention_cls(
-        num_heads=self.num_attention_heads,
-        key_size=self.attention_head_size,
-        dropout=self.attention_probs_dropout_prob,
-        output_shape=self.hidden_size,
-        kernel_initializer=self._kernel_initializer,
-        name="attention/encdec")
-    self.encdec_attention_dropout = tf.keras.layers.Dropout(
-        rate=self.hidden_dropout_prob)
-    self.encdec_attention_layer_norm = (
-        tf.keras.layers.LayerNormalization(
-            name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))
-    # Feed-forward projection.
-    self.intermediate_dense = layers.DenseEinsum(
-        output_shape=self.intermediate_size,
-        activation=None,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        name="intermediate")
-    self.intermediate_activation_layer = tf.keras.layers.Activation(
-        self.intermediate_activation)
-    self.output_dense = layers.DenseEinsum(
-        output_shape=self.hidden_size,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        name="output")
-    self.output_dropout = tf.keras.layers.Dropout(rate=self.hidden_dropout_prob)
-    self.output_layer_norm = tf.keras.layers.LayerNormalization(
-        name="output_layer_norm", axis=-1, epsilon=1e-12)
-    super(TransformerDecoderBlock, self).build(input_shape)
-  def common_layers_with_encoder(self):
-    """Gets layer objects that can make a Transformer encoder block."""
-    return [
-        self.self_attention, self.self_attention_layer_norm,
-        self.intermediate_dense, self.output_dense, self.output_layer_norm
-    ]
-  def call(self, inputs, cache=None, decode_loop_step=None):
-    if self.multi_channel_cross_attention:
-      if len(inputs) != 5:
-        raise ValueError(
-            "TransformerDecoderBlock must have 5 inputs, when it uses "
-            "multi_channel_cross_attention. But it got: %d" % len(inputs))
-    elif len(inputs) != 4:
-      raise ValueError(
-          "TransformerDecoderBlock must have 4 inputs, but it got: %d" %
-          len(inputs))
-    input_tensor, memory, attention_mask, self_attention_mask = inputs[:4]
-    self_attention_inputs = [input_tensor, input_tensor]
-    self_attention_output, cache = self.self_attention(
-        self_attention_inputs,
-        attention_mask=self_attention_mask,
-        cache=cache,
-        decode_loop_step=decode_loop_step)
-    self_attention_output = self.self_attention_dropout(self_attention_output)
-    self_attention_output = self.self_attention_layer_norm(
-        input_tensor + self_attention_output)
-    cross_attn_inputs = [self_attention_output, memory]
-    if self.multi_channel_cross_attention:
-      # Accesses the 5-th input tensor for the doc-attention probabilities.
-      cross_attn_inputs.append(inputs[-1])
-    attention_output = self.encdec_attention(cross_attn_inputs, attention_mask)
-    attention_output = self.encdec_attention_dropout(attention_output)
-    attention_output = self.encdec_attention_layer_norm(self_attention_output +
-                                                        attention_output)
-    intermediate_output = self.intermediate_dense(attention_output)
-    intermediate_output = self.intermediate_activation_layer(
-        intermediate_output)
-    layer_output = self.output_dense(intermediate_output)
-    layer_output = self.output_dropout(layer_output)
-    layer_output = self.output_layer_norm(layer_output + attention_output)
-    return layer_output, cache
 class TransformerDecoder(tf.keras.layers.Layer):
  """Transformer decoder stack."""
@@ -200,14 +59,14 @@ class TransformerDecoder(tf.keras.layers.Layer):
    self.layers = []
    for i in range(self.num_hidden_layers):
      self.layers.append(
-          TransformerDecoderBlock(
+          transformer.TransformerDecoderLayer(
-              hidden_size=self.hidden_size,
              num_attention_heads=self.num_attention_heads,
              intermediate_size=self.intermediate_size,
              intermediate_activation=self.intermediate_activation,
-              hidden_dropout_prob=self.hidden_dropout_prob,
+              dropout_rate=self.hidden_dropout_prob,
-              attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+              attention_dropout_rate=self.attention_probs_dropout_prob,
-              initializer_range=self.initializer_range,
+              kernel_initializer=tf.keras.initializers.TruncatedNormal(
+                  stddev=self.initializer_range),
              multi_channel_cross_attention=self.multi_channel_cross_attention,
              name=("layer_%d" % i)))
    super(TransformerDecoder, self).build(unused_input_shapes)

--- a/official/nlp/nhnet/decoder_test.py
+++ b/official/nlp/nhnet/decoder_test.py
@@ -26,17 +26,6 @@ from official.nlp.nhnet import decoder
 from official.nlp.nhnet import utils
-def _create_cache(batch_size, init_decode_length, num_heads, head_size):
-  return {
-      "key":
-          tf.zeros([batch_size, init_decode_length, num_heads, head_size],
-                   dtype=tf.float32),
-      "value":
-          tf.zeros([batch_size, init_decode_length, num_heads, head_size],
-                   dtype=tf.float32)
-  }
 class DecoderTest(tf.test.TestCase):
  def setUp(self):
@@ -56,26 +45,6 @@ class DecoderTest(tf.test.TestCase):
    decoder_block.build(None)
    self.assertEqual(len(decoder_block.layers), self._config.num_hidden_layers)
-  def test_decoder_block_with_cache(self):
-    decoder_block = decoder.TransformerDecoderBlock(
-        hidden_size=self._config.hidden_size,
-        num_attention_heads=self._config.num_attention_heads,
-        intermediate_size=self._config.intermediate_size,
-        intermediate_activation=self._config.hidden_act,
-        hidden_dropout_prob=self._config.hidden_dropout_prob,
-        attention_probs_dropout_prob=self._config.attention_probs_dropout_prob,
-        initializer_range=self._config.initializer_range)
-    # Forward path.
-    dummy_tensor = tf.zeros([2, 4, self._config.hidden_size], dtype=tf.float32)
-    dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
-    inputs = [dummy_tensor, dummy_tensor, dummy_mask, dummy_mask]
-    cache = _create_cache(
-        2, 0, self._config.num_attention_heads,
-        self._config.hidden_size // self._config.num_attention_heads)
-    output, cache = decoder_block(inputs, cache)
-    self.assertEqual(output.shape, (2, 4, self._config.hidden_size))
-    self.assertEqual(cache["value"].shape, (2, 4, 2, 8))
  def test_bert_decoder(self):
    seq_length = 10
    encoder_input_ids = tf.keras.layers.Input(

--- a/official/nlp/nhnet/models.py
+++ b/official/nlp/nhnet/models.py
@@ -27,11 +27,11 @@ from typing import Optional, Text
 from official.modeling import tf_utils
 from official.modeling.hyperparams import params_dict
 from official.nlp.modeling import networks
+from official.nlp.modeling.layers import multi_channel_attention
 from official.nlp.nhnet import configs
 from official.nlp.nhnet import decoder
-from official.nlp.nhnet import multi_channel_attention
 from official.nlp.nhnet import utils
-from official.nlp.transformer import beam_search
+from official.nlp.modeling.ops import beam_search
 def embedding_linear(embedding_matrix, x):
@@ -273,7 +273,7 @@ class NHNet(Bert2Bert):
  def __init__(self, params, bert_layer, decoder_layer, name=None):
    super(NHNet, self).__init__(params, bert_layer, decoder_layer, name=name)
-    self.doc_attention = multi_channel_attention.DocAttention(
+    self.doc_attention = multi_channel_attention.VotingAttention(
        num_heads=params.num_decoder_attn_heads,
        head_size=params.hidden_size // params.num_decoder_attn_heads)
@@ -413,7 +413,6 @@ def get_bert2bert_layers(params: configs.BERT2BERTConfig):
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
-      sequence_length=None,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(

--- a/official/nlp/tasks/electra_task.py
+++ b/official/nlp/tasks/electra_task.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ELECTRA pretraining task (Joint Masked LM and Replaced Token Detection)."""
+import dataclasses
+import tensorflow as tf
+from official.core import base_task
+from official.core import task_factory
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.configs import bert
+from official.nlp.configs import electra
+from official.nlp.data import pretrain_dataloader
+@dataclasses.dataclass
+class ELECTRAPretrainConfig(cfg.TaskConfig):
+  """ELECTRA pretraining task config (model plus train/validation data)."""
+  model: electra.ELECTRAPretrainerConfig = electra.ELECTRAPretrainerConfig(
+      cls_heads=[
+          bert.ClsHeadConfig(
+              inner_dim=768,
+              num_classes=2,
+              dropout_rate=0.1,
+              name='next_sentence')
+      ])
+  train_data: cfg.DataConfig = cfg.DataConfig()
+  validation_data: cfg.DataConfig = cfg.DataConfig()
+@task_factory.register_task_cls(ELECTRAPretrainConfig)
+class ELECTRAPretrainTask(base_task.Task):
+  """ELECTRA Pretrain Task (Masked LM + Replaced Token Detection)."""
+  def build_model(self):
+    return electra.instantiate_pretrainer_from_cfg(
+        self.task_config.model)
+  def build_losses(self,
+                   labels,
+                   model_outputs,
+                   metrics,
+                   aux_losses=None) -> tf.Tensor:
+    metrics = dict([(metric.name, metric) for metric in metrics])
+    # Generator masked-LM loss and (optional) next-sentence prediction loss.
+    lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
+        labels['masked_lm_ids'],
+        tf.cast(model_outputs['lm_outputs'], tf.float32),
+        from_logits=True)
+    lm_label_weights = labels['masked_lm_weights']
+    lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
+    lm_denominator_loss = tf.reduce_sum(lm_label_weights)
+    mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
+    metrics['lm_example_loss'].update_state(mlm_loss)
+    if 'next_sentence_labels' in labels:
+      sentence_labels = labels['next_sentence_labels']
+      sentence_outputs = tf.cast(
+          model_outputs['sentence_outputs'], dtype=tf.float32)
+      sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
+          sentence_labels,
+          sentence_outputs,
+          from_logits=True)
+      metrics['next_sentence_loss'].update_state(sentence_loss)
+      total_loss = mlm_loss + sentence_loss
+    else:
+      total_loss = mlm_loss
+    # discriminator replaced token detection (rtd) loss.
+    rtd_logits = model_outputs['disc_logits']
+    rtd_labels = tf.cast(model_outputs['disc_label'], tf.float32)
+    input_mask = tf.cast(labels['input_mask'], tf.float32)
+    rtd_ind_loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        logits=rtd_logits, labels=rtd_labels)
+    rtd_numerator = tf.reduce_sum(input_mask * rtd_ind_loss)
+    rtd_denominator = tf.reduce_sum(input_mask)
+    rtd_loss = tf.math.divide_no_nan(rtd_numerator, rtd_denominator)
+    metrics['discriminator_loss'].update_state(rtd_loss)
+    total_loss = total_loss + \
+        self.task_config.model.discriminator_loss_weight * rtd_loss
+    if aux_losses:
+      total_loss += tf.add_n(aux_losses)
+    metrics['total_loss'].update_state(total_loss)
+    return total_loss
+  def build_inputs(self, params, input_context=None):
+    """Returns tf.data.Dataset for pretraining."""
+    if params.input_path == 'dummy':
+      def dummy_data(_):
+        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
+        dummy_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32)
+        return dict(
+            input_word_ids=dummy_ids,
+            input_mask=dummy_ids,
+            input_type_ids=dummy_ids,
+            masked_lm_positions=dummy_lm,
+            masked_lm_ids=dummy_lm,
+            masked_lm_weights=tf.cast(dummy_lm, dtype=tf.float32),
+            next_sentence_labels=tf.zeros((1, 1), dtype=tf.int32))
+      dataset = tf.data.Dataset.range(1)
+      dataset = dataset.repeat()
+      dataset = dataset.map(
+          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+      return dataset
+    return pretrain_dataloader.BertPretrainDataLoader(params).load(
+        input_context)
+  def build_metrics(self, training=None):
+    del training
+    metrics = [
+        tf.keras.metrics.SparseCategoricalAccuracy(name='masked_lm_accuracy'),
+        tf.keras.metrics.Mean(name='lm_example_loss'),
+        tf.keras.metrics.SparseCategoricalAccuracy(
+            name='discriminator_accuracy'),
+    ]
+    if self.task_config.train_data.use_next_sentence_label:
+      metrics.append(
+          tf.keras.metrics.SparseCategoricalAccuracy(
+              name='next_sentence_accuracy'))
+      metrics.append(tf.keras.metrics.Mean(name='next_sentence_loss'))
+    metrics.append(tf.keras.metrics.Mean(name='discriminator_loss'))
+    metrics.append(tf.keras.metrics.Mean(name='total_loss'))
+    return metrics
+  def process_metrics(self, metrics, labels, model_outputs):
+    metrics = dict([(metric.name, metric) for metric in metrics])
+    if 'masked_lm_accuracy' in metrics:
+      metrics['masked_lm_accuracy'].update_state(labels['masked_lm_ids'],
+                                                 model_outputs['lm_outputs'],
+                                                 labels['masked_lm_weights'])
+    if 'next_sentence_accuracy' in metrics:
+      metrics['next_sentence_accuracy'].update_state(
+          labels['next_sentence_labels'], model_outputs['sentence_outputs'])
+    if 'discriminator_accuracy' in metrics:
+      disc_logits_expanded = tf.expand_dims(model_outputs['disc_logits'], -1)
+      discrim_full_logits = tf.concat(
+          [-1.0 * disc_logits_expanded, disc_logits_expanded], -1)
+      metrics['discriminator_accuracy'].update_state(
+          model_outputs['disc_label'], discrim_full_logits,
+          labels['input_mask'])
+  def train_step(self, inputs, model: tf.keras.Model,
+                 optimizer: tf.keras.optimizers.Optimizer, metrics):
+    """Does forward and backward.
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the model, forward pass definition.
+      optimizer: the optimizer for this training step.
+      metrics: a nested structure of metrics objects.
+    Returns:
+      A dictionary of logs.
+    """
+    with tf.GradientTape() as tape:
+      outputs = model(inputs, training=True)
+      # Computes per-replica loss.
+      loss = self.build_losses(
+          labels=inputs,
+          model_outputs=outputs,
+          metrics=metrics,
+          aux_losses=model.losses)
+      # Scales loss as the default gradients allreduce performs sum inside the
+      # optimizer.
+      # TODO(b/154564893): enable loss scaling.
+      scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync
+    tvars = model.trainable_variables
+    grads = tape.gradient(scaled_loss, tvars)
+    optimizer.apply_gradients(list(zip(grads, tvars)))
+    self.process_metrics(metrics, inputs, outputs)
+    return {self.loss: loss}
+  def validation_step(self, inputs, model: tf.keras.Model, metrics):
+    """Validation step.
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+    Returns:
+      A dictionary of logs.
+    """
+    outputs = model(inputs, training=False)
+    loss = self.build_losses(
+        labels=inputs,
+        model_outputs=outputs,
+        metrics=metrics,
+        aux_losses=model.losses)
+    self.process_metrics(metrics, inputs, outputs)
+    return {self.loss: loss}
--- a/research/compression/entropy_coder/lib/blocks_entropy_coding_test.py
+++ b/research/compression/entropy_coder/lib/blocks_entropy_coding_test.py
-# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,45 +13,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Tests for official.nlp.tasks.electra_task."""
-"""Tests for basic tensorflow blocks_entropy_coding."""
-from __future__ import division
-from __future__ import unicode_literals
-import math
-import numpy as np
 import tensorflow as tf
-import blocks_entropy_coding
+from official.nlp.configs import bert
+from official.nlp.configs import electra
+from official.nlp.configs import encoders
-class BlocksEntropyCodingTest(tf.test.TestCase):
+from official.nlp.data import pretrain_dataloader
+from official.nlp.tasks import electra_task
-  def testCodeLength(self):
-    shape = [2, 4]
-    proba_feed = [[0.65, 0.25, 0.70, 0.10],
+class ELECTRAPretrainTaskTest(tf.test.TestCase):
-                  [0.28, 0.20, 0.44, 0.54]]
-    symbol_feed = [[1.0, 0.0, 1.0, 0.0],
+  def test_task(self):
-                   [0.0, 0.0, 0.0, 1.0]]
+    config = electra_task.ELECTRAPretrainConfig(
-    mean_code_length = - (
+        model=electra.ELECTRAPretrainerConfig(
-        (math.log(0.65) + math.log(0.75) + math.log(0.70) + math.log(0.90) +
+            generator_encoder=encoders.TransformerEncoderConfig(
-         math.log(0.72) + math.log(0.80) + math.log(0.56) + math.log(0.54)) /
+                vocab_size=30522, num_layers=1),
-        math.log(2.0)) / (shape[0] * shape[1])
+            discriminator_encoder=encoders.TransformerEncoderConfig(
+                vocab_size=30522, num_layers=1),
-    symbol = tf.placeholder(dtype=tf.float32, shape=shape)
+            num_masked_tokens=20,
-    proba = tf.placeholder(dtype=tf.float32, shape=shape)
+            sequence_length=128,
-    code_length_calculator = blocks_entropy_coding.CodeLength()
+            cls_heads=[
-    code_length = code_length_calculator(symbol, proba)
+                bert.ClsHeadConfig(
+                    inner_dim=10, num_classes=2, name="next_sentence")
-    with self.test_session():
+            ]),
-      tf.global_variables_initializer().run()
+        train_data=pretrain_dataloader.BertPretrainDataConfig(
-      code_length_eval = code_length.eval(
+            input_path="dummy",
-          feed_dict={symbol: symbol_feed, proba: proba_feed})
+            max_predictions_per_seq=20,
+            seq_length=128,
-    self.assertAllClose(mean_code_length, code_length_eval)
+            global_batch_size=1))
+    task = electra_task.ELECTRAPretrainTask(config)
+    model = task.build_model()
-if __name__ == '__main__':
+    metrics = task.build_metrics()
+    dataset = task.build_inputs(config.train_data)
+    iterator = iter(dataset)
+    optimizer = tf.keras.optimizers.SGD(lr=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    task.validation_step(next(iterator), model, metrics=metrics)
+if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/tasks/masked_lm.py
+++ b/official/nlp/tasks/masked_lm.py
@@ -18,16 +18,16 @@ import dataclasses
 import tensorflow as tf
 from official.core import base_task
+from official.core import task_factory
 from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import bert
-from official.nlp.data import pretrain_dataloader
+from official.nlp.data import data_loader_factory
-from official.nlp.modeling import losses as loss_lib
 @dataclasses.dataclass
 class MaskedLMConfig(cfg.TaskConfig):
  """The model config."""
-  network: bert.BertPretrainerConfig = bert.BertPretrainerConfig(cls_heads=[
+  model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(cls_heads=[
      bert.ClsHeadConfig(
          inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence')
  ])
@@ -35,12 +35,13 @@ class MaskedLMConfig(cfg.TaskConfig):
  validation_data: cfg.DataConfig = cfg.DataConfig()
-@base_task.register_task_cls(MaskedLMConfig)
+@task_factory.register_task_cls(MaskedLMConfig)
 class MaskedLMTask(base_task.Task):
  """Mock task object for testing."""
-  def build_model(self):
+  def build_model(self, params=None):
-    return bert.instantiate_from_cfg(self.task_config.network)
+    params = params or self.task_config.model
+    return bert.instantiate_pretrainer_from_cfg(params)
  def build_losses(self,
                   labels,
@@ -48,23 +49,23 @@ class MaskedLMTask(base_task.Task):
                   metrics,
                   aux_losses=None) -> tf.Tensor:
    metrics = dict([(metric.name, metric) for metric in metrics])
-    lm_output = tf.nn.log_softmax(model_outputs['lm_output'], axis=-1)
+    lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
-    mlm_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
+        labels['masked_lm_ids'],
-        labels=labels['masked_lm_ids'],
+        tf.cast(model_outputs['lm_output'], tf.float32),
-        predictions=lm_output,
+        from_logits=True)
-        weights=labels['masked_lm_weights'])
+    lm_label_weights = labels['masked_lm_weights']
+    lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
+    lm_denominator_loss = tf.reduce_sum(lm_label_weights)
+    mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
    metrics['lm_example_loss'].update_state(mlm_loss)
    if 'next_sentence_labels' in labels:
-      policy = tf.keras.mixed_precision.experimental.global_policy()
-      if policy.name == 'mixed_bfloat16':  # b/158514794: bf16 is not stable.
-        policy = tf.float32
-      predictions = tf.keras.layers.Activation(
-          tf.nn.log_softmax, dtype=policy)(model_outputs['next_sentence'])
      sentence_labels = labels['next_sentence_labels']
-      sentence_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
+      sentence_outputs = tf.cast(
-          labels=sentence_labels,
+          model_outputs['next_sentence'], dtype=tf.float32)
-          predictions=predictions)
+      sentence_loss = tf.reduce_mean(
+          tf.keras.losses.sparse_categorical_crossentropy(sentence_labels,
+                                                          sentence_outputs,
+                                                          from_logits=True))
      metrics['next_sentence_loss'].update_state(sentence_loss)
      total_loss = mlm_loss + sentence_loss
    else:
@@ -77,6 +78,7 @@ class MaskedLMTask(base_task.Task):
  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for pretraining."""
    if params.input_path == 'dummy':
      def dummy_data(_):
        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
        dummy_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32)
@@ -95,8 +97,7 @@ class MaskedLMTask(base_task.Task):
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset
-    return pretrain_dataloader.BertPretrainDataLoader(params).load(
+    return data_loader_factory.get_data_loader(params).load(input_context)
-        input_context)
  def build_metrics(self, training=None):
    del training

--- a/official/nlp/tasks/masked_lm_test.py
+++ b/official/nlp/tasks/masked_lm_test.py
@@ -19,6 +19,7 @@ import tensorflow as tf
 from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import pretrain_dataloader
 from official.nlp.tasks import masked_lm
@@ -26,14 +27,14 @@ class MLMTaskTest(tf.test.TestCase):
  def test_task(self):
    config = masked_lm.MaskedLMConfig(
-        network=bert.BertPretrainerConfig(
+        init_checkpoint=self.get_temp_dir(),
+        model=bert.BertPretrainerConfig(
            encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
-            num_masked_tokens=20,
            cls_heads=[
                bert.ClsHeadConfig(
                    inner_dim=10, num_classes=2, name="next_sentence")
            ]),
-        train_data=bert.BertPretrainDataConfig(
+        train_data=pretrain_dataloader.BertPretrainDataConfig(
            input_path="dummy",
            max_predictions_per_seq=20,
            seq_length=128,
@@ -48,6 +49,12 @@ class MLMTaskTest(tf.test.TestCase):
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)
+    # Saves a checkpoint.
+    ckpt = tf.train.Checkpoint(
+        model=model, **model.checkpoint_items)
+    ckpt.save(config.init_checkpoint)
+    task.initialize(model)
 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/tasks/question_answering.py
+++ b/official/nlp/tasks/question_answering.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Question answering task."""
+import collections
+import json
+import os
+from absl import logging
+import dataclasses
+import tensorflow as tf
+import tensorflow_hub as hub
+from official.core import base_task
+from official.core import task_factory
+from official.modeling.hyperparams import base_config
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.bert import squad_evaluate_v1_1
+from official.nlp.bert import squad_evaluate_v2_0
+from official.nlp.bert import tokenization
+from official.nlp.configs import encoders
+from official.nlp.data import data_loader_factory
+from official.nlp.data import squad_lib as squad_lib_wp
+from official.nlp.data import squad_lib_sp
+from official.nlp.modeling import models
+from official.nlp.tasks import utils
+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """A base span labeler configuration.
+
+  Attributes:
+    encoder: Configuration of the transformer encoder the span labeler is
+      built on.
+  """
+  # Build the default through a factory so every config owns its encoder
+  # config. A dataclass instance used directly as a default is shared between
+  # all configs and is rejected as a mutable default on Python 3.11+.
+  encoder: encoders.TransformerEncoderConfig = dataclasses.field(
+      default_factory=encoders.TransformerEncoderConfig)
+@dataclasses.dataclass
+class QuestionAnsweringConfig(cfg.TaskConfig):
+  """Configuration of the question answering task.
+
+  `n_best_size`, `max_answer_length` and `null_score_diff_threshold` are
+  forwarded to `squad_lib.postprocess_output` when evaluation logs are reduced.
+  """
+  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
+  init_checkpoint: str = ''
+  hub_module_url: str = ''
+  n_best_size: int = 20
+  max_answer_length: int = 30
+  null_score_diff_threshold: float = 0.0
+  # Nested configs are created per instance through factories. A dataclass
+  # instance used directly as a default is shared between all task configs and
+  # is rejected as a mutable default on Python 3.11+.
+  model: ModelConfig = dataclasses.field(default_factory=ModelConfig)
+  train_data: cfg.DataConfig = dataclasses.field(
+      default_factory=cfg.DataConfig)
+  validation_data: cfg.DataConfig = dataclasses.field(
+      default_factory=cfg.DataConfig)
+@task_factory.register_task_cls(QuestionAnsweringConfig)
+class QuestionAnsweringTask(base_task.Task):
+  """Task object for question answering."""
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
+    """Sets up the hub module, the SQuAD library and the evaluation data.
+
+    Args:
+      params: The task configuration. `hub_module_url`, `init_checkpoint` and
+        `validation_data` are read here.
+      logging_dir: Forwarded to the base class constructor.
+
+    Raises:
+      ValueError: If both `hub_module_url` and `init_checkpoint` are set, or
+        if `validation_data.tokenization` is neither 'WordPiece' nor
+        'SentencePiece'.
+    """
+    # NOTE(review): the default for `params` is the `cfg.TaskConfig` class
+    # itself, not an instance - confirm callers always pass a config object.
+    super(QuestionAnsweringTask, self).__init__(params, logging_dir)
+    if params.hub_module_url and params.init_checkpoint:
+      raise ValueError('At most one of `hub_module_url` and '
+                       '`init_checkpoint` can be specified.')
+    if params.hub_module_url:
+      self._hub_module = hub.load(params.hub_module_url)
+    else:
+      self._hub_module = None
+    # Pick the SQuAD preprocessing library that matches the tokenization.
+    if params.validation_data.tokenization == 'WordPiece':
+      self.squad_lib = squad_lib_wp
+    elif params.validation_data.tokenization == 'SentencePiece':
+      self.squad_lib = squad_lib_sp
+    else:
+      raise ValueError('Unsupported tokenization method: {}'.format(
+          params.validation_data.tokenization))
+    # The evaluation attributes below are only created when a validation input
+    # path is configured; they do not exist otherwise.
+    if params.validation_data.input_path:
+      self._tf_record_input_path, self._eval_examples, self._eval_features = (
+          self._preprocess_eval_data(params.validation_data))
+  def build_model(self):
+    """Builds a `models.BertSpanLabeler` on top of the configured encoder.
+
+    The encoder comes from the loaded hub module when one was configured,
+    otherwise it is instantiated from `task_config.model.encoder`.
+
+    Returns:
+      A `models.BertSpanLabeler` instance.
+    """
+    if self._hub_module:
+      encoder_network = utils.get_encoder_from_hub(self._hub_module)
+    else:
+      encoder_network = encoders.instantiate_encoder_from_cfg(
+          self.task_config.model.encoder)
+    # Currently, we only support bert-style question answering finetuning.
+    return models.BertSpanLabeler(
+        network=encoder_network,
+        initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=self.task_config.model.encoder.initializer_range))
+  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
+    """Averages the start-position and end-position cross-entropy losses.
+
+    Args:
+      labels: Mapping with 'start_positions' and 'end_positions' entries.
+      model_outputs: A `(start_logits, end_logits)` pair.
+      aux_losses: Accepted for interface compatibility; not used here.
+
+    Returns:
+      The mean of the two per-position mean losses.
+    """
+    def _mean_position_loss(positions, logits):
+      # Logits are cast to float32 before the cross entropy is computed.
+      per_example = tf.keras.losses.sparse_categorical_crossentropy(
+          positions, tf.cast(logits, dtype=tf.float32), from_logits=True)
+      return tf.reduce_mean(per_example)
+    start_logits, end_logits = model_outputs
+    start_term = _mean_position_loss(labels['start_positions'], start_logits)
+    end_term = _mean_position_loss(labels['end_positions'], end_logits)
+    return (start_term + end_term) / 2
+  def _preprocess_eval_data(self, params):
+    """Converts the raw SQuAD evaluation file into a TFRecord of features.
+
+    Args:
+      params: The validation data config. Read here: `input_path`,
+        `version_2_with_negative`, `input_preprocessed_data_path`,
+        `vocab_file`, `do_lower_case`, `seq_length`, `doc_stride`,
+        `query_length`, `global_batch_size` and `tokenization`.
+
+    Returns:
+      A tuple `(tf_record_path, eval_examples, eval_features)`.
+
+    Raises:
+      ValueError: If neither `params.input_preprocessed_data_path` nor the
+        task's `logging_dir` provides a directory for the TFRecord file.
+    """
+    eval_examples = self.squad_lib.read_squad_examples(
+        input_file=params.input_path,
+        is_training=False,
+        version_2_with_negative=params.version_2_with_negative)
+    temp_file_path = params.input_preprocessed_data_path or self.logging_dir
+    if not temp_file_path:
+      raise ValueError('You must specify a temporary directory, either in '
+                       'params.input_preprocessed_data_path or logging_dir to '
+                       'store intermediate evaluation TFRecord data.')
+    eval_writer = self.squad_lib.FeatureWriter(
+        filename=os.path.join(temp_file_path, 'eval.tf_record'),
+        is_training=False)
+    eval_features = []
+    def _append_feature(feature, is_padding):
+      # Every feature is written to the TFRecord, but padding features are
+      # left out of the in-memory list.
+      if not is_padding:
+        eval_features.append(feature)
+      eval_writer.process_feature(feature)
+    # NOTE(review): a `tokenization.FullTokenizer` built from `vocab_file` is
+    # passed for both tokenization modes - confirm this is what squad_lib_sp
+    # expects.
+    kwargs = dict(
+        examples=eval_examples,
+        tokenizer=tokenization.FullTokenizer(
+            vocab_file=params.vocab_file,
+            do_lower_case=params.do_lower_case),
+        max_seq_length=params.seq_length,
+        doc_stride=params.doc_stride,
+        max_query_length=params.query_length,
+        is_training=False,
+        output_fn=_append_feature,
+        batch_size=params.global_batch_size)
+    if params.tokenization == 'SentencePiece':
+      # squad_lib_sp requires one more argument 'do_lower_case'.
+      kwargs['do_lower_case'] = params.do_lower_case
+    eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
+    eval_writer.close()
+    logging.info('***** Evaluation input stats *****')
+    logging.info('  Num orig examples = %d', len(eval_examples))
+    logging.info('  Num split examples = %d', len(eval_features))
+    logging.info('  Batch size = %d', params.global_batch_size)
+    logging.info('  Dataset size = %d', eval_dataset_size)
+    return eval_writer.filename, eval_examples, eval_features
+  def build_inputs(self, params, input_context=None):
+    """Returns tf.data.Dataset for the question answering task.
+
+    Args:
+      params: The data config. An `input_path` of 'dummy' yields an endless
+        synthetic dataset for unit tests.
+      input_context: Passed through to the data loader's `load` method.
+
+    Returns:
+      The dummy dataset, or whatever the data loader's `load` returns.
+    """
+    if params.input_path == 'dummy':
+      # Dummy training data for unit test.
+      def dummy_data(_):
+        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
+        x = dict(
+            input_word_ids=dummy_ids,
+            input_mask=dummy_ids,
+            input_type_ids=dummy_ids)
+        y = dict(
+            start_positions=tf.constant(0, dtype=tf.int32),
+            end_positions=tf.constant(1, dtype=tf.int32))
+        return (x, y)
+      dataset = tf.data.Dataset.range(1)
+      dataset = dataset.repeat()
+      dataset = dataset.map(
+          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+      return dataset
+    if params.is_training:
+      dataloader_params = params
+    else:
+      # Evaluation reads the TFRecord file produced by
+      # `_preprocess_eval_data` instead of the raw input file.
+      input_path = self._tf_record_input_path
+      dataloader_params = params.replace(input_path=input_path)
+    return data_loader_factory.get_data_loader(
+        dataloader_params).load(input_context)
+  def build_metrics(self, training=None):
+    """Creates the start- and end-position accuracy metrics.
+
+    Args:
+      training: Unused.
+
+    Returns:
+      A list with two `SparseCategoricalAccuracy` metrics, start first.
+    """
+    del training
+    # TODO(lehou): a list of metrics doesn't work the same as in compile/fit.
+    return [
+        tf.keras.metrics.SparseCategoricalAccuracy(name=metric_name)
+        for metric_name in ('start_position_accuracy', 'end_position_accuracy')
+    ]
+  def process_metrics(self, metrics, labels, model_outputs):
+    """Updates the start/end position accuracy metrics for one batch.
+
+    Args:
+      metrics: Iterable of metric objects; must include metrics named
+        'start_position_accuracy' and 'end_position_accuracy'.
+      labels: Mapping with 'start_positions' and 'end_positions' entries.
+      model_outputs: A `(start_logits, end_logits)` pair.
+    """
+    # Index the metrics by name with a dict comprehension; the parameter is
+    # left untouched rather than rebound to a different type.
+    metrics_by_name = {metric.name: metric for metric in metrics}
+    start_logits, end_logits = model_outputs
+    metrics_by_name['start_position_accuracy'].update_state(
+        labels['start_positions'], start_logits)
+    metrics_by_name['end_position_accuracy'].update_state(
+        labels['end_positions'], end_logits)
+  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
+    """Updates compiled metrics, keying the logits like the labels.
+
+    Args:
+      compiled_metrics: Object exposing `update_state(y_true=..., y_pred=...)`.
+      labels: Mapping with 'start_positions' and 'end_positions' entries.
+      model_outputs: A `(start_logits, end_logits)` pair.
+    """
+    start_logits, end_logits = model_outputs
+    compiled_metrics.update_state(
+        y_true=labels,  # labels has keys 'start_positions' and 'end_positions'.
+        y_pred={'start_positions': start_logits, 'end_positions': end_logits})
+  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
+    """Runs inference on one batch and returns the raw logits.
+
+    Args:
+      inputs: A `(features, labels)` pair; the labels are ignored.
+      model: The model handed to `self.inference_step`.
+      metrics: Unused.
+
+    Returns:
+      A dict holding a placeholder loss of 0.0 under `self.loss`, plus
+      'unique_ids', 'start_logits' and 'end_logits'.
+    """
+    features, _ = inputs
+    # Note: this removes 'unique_ids' from the caller's `features` mapping.
+    unique_ids = features.pop('unique_ids')
+    model_outputs = self.inference_step(features, model)
+    start_logits, end_logits = model_outputs
+    logs = {
+        self.loss: 0.0,  # TODO(lehou): compute the real validation loss.
+        'unique_ids': unique_ids,
+        'start_logits': start_logits,
+        'end_logits': end_logits,
+    }
+    return logs
+  # Record type holding one example's unique id and raw start/end logits;
+  # instances are created in `aggregate_logs`.
+  raw_aggregated_result = collections.namedtuple(
+      'RawResult', ['unique_id', 'start_logits', 'end_logits'])
+  def aggregate_logs(self, state=None, step_outputs=None):
+    """Appends the results of one validation step to the running state.
+
+    Args:
+      state: List of `raw_aggregated_result` records collected so far, or
+        None to start a new list.
+      step_outputs: The logs returned by `validation_step`; must not be None.
+
+    Returns:
+      The list `state` with one record appended per example.
+    """
+    assert step_outputs is not None, 'Got no logs from self.validation_step.'
+    if state is None:
+      state = []
+    # Each entry of `step_outputs` is iterated in lockstep; presumably one
+    # element per replica - TODO confirm against the caller.
+    for unique_ids, start_logits, end_logits in zip(
+        step_outputs['unique_ids'],
+        step_outputs['start_logits'],
+        step_outputs['end_logits']):
+      u_ids, s_logits, e_logits = (
+          unique_ids.numpy(), start_logits.numpy(), end_logits.numpy())
+      # A single-element result is wrapped in lists so that the zip below
+      # yields exactly one record for it.
+      if u_ids.size == 1:
+        u_ids = [u_ids]
+        s_logits = [s_logits]
+        e_logits = [e_logits]
+      for values in zip(u_ids, s_logits, e_logits):
+        state.append(self.raw_aggregated_result(
+            unique_id=values[0],
+            start_logits=values[1].tolist(),
+            end_logits=values[2].tolist()))
+    return state
+  def reduce_aggregated_logs(self, aggregated_logs):
+    """Turns the aggregated raw results into SQuAD evaluation metrics.
+
+    Args:
+      aggregated_logs: The list of records built by `aggregate_logs`.
+
+    Returns:
+      A dict of metrics. Both SQuAD versions report 'exact_match' and
+      'final_f1'; version 2.0 adds threshold and has-answer metrics.
+    """
+    all_predictions, _, scores_diff = (
+        self.squad_lib.postprocess_output(
+            self._eval_examples,
+            self._eval_features,
+            aggregated_logs,
+            self.task_config.n_best_size,
+            self.task_config.max_answer_length,
+            self.task_config.validation_data.do_lower_case,
+            version_2_with_negative=(
+                self.task_config.validation_data.version_2_with_negative),
+            null_score_diff_threshold=(
+                self.task_config.null_score_diff_threshold),
+            verbose=False))
+    # The ground truth is re-read from the original (unprocessed) input file.
+    with tf.io.gfile.GFile(
+        self.task_config.validation_data.input_path, 'r') as reader:
+      dataset_json = json.load(reader)
+      pred_dataset = dataset_json['data']
+    if self.task_config.validation_data.version_2_with_negative:
+      eval_metrics = squad_evaluate_v2_0.evaluate(
+          pred_dataset, all_predictions, scores_diff)
+      # Filter out useless metrics, such as start_position_accuracy that
+      # we did not actually compute.
+      eval_metrics = {
+          'exact_match': eval_metrics['final_exact'],
+          'exact_match_threshold': eval_metrics['final_exact_thresh'],
+          'final_f1': eval_metrics['final_f1'] / 100.0,  # scale back to [0, 1].
+          'f1_threshold': eval_metrics['final_f1_thresh'],
+          'has_answer_exact_match': eval_metrics['HasAns_exact'],
+          'has_answer_f1': eval_metrics['HasAns_f1']}
+    else:
+      eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
+      # Filter out useless metrics, such as start_position_accuracy that
+      # we did not actually compute.
+      eval_metrics = {'exact_match': eval_metrics['exact_match'],
+                      'final_f1': eval_metrics['final_f1']}
+    return eval_metrics
--- a/official/nlp/tasks/question_answering_test.py
+++ b/official/nlp/tasks/question_answering_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for official.nlp.tasks.question_answering."""
+import itertools
+import json
+import os
+from absl.testing import parameterized
+import tensorflow as tf
+from official.nlp.bert import configs
+from official.nlp.bert import export_tfhub
+from official.nlp.configs import bert
+from official.nlp.configs import encoders
+from official.nlp.data import question_answering_dataloader
+from official.nlp.tasks import question_answering
+class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
+  def setUp(self):
+    """Creates the shared encoder/train configs and on-disk test fixtures.
+
+    Writes a one-question SQuAD v1.1 style validation file and a small
+    vocabulary file into the test's temporary directory.
+    """
+    super(QuestionAnsweringTaskTest, self).setUp()
+    self._encoder_config = encoders.TransformerEncoderConfig(
+        vocab_size=30522, num_layers=1)
+    # An input path of "dummy" makes the task generate synthetic train data.
+    self._train_data_config = question_answering_dataloader.QADataConfig(
+        input_path="dummy",
+        seq_length=128,
+        global_batch_size=1)
+    val_data = {"version": "1.1",
+                "data": [{"paragraphs": [
+                    {"context": "Sky is blue.",
+                     "qas": [{"question": "What is blue?", "id": "1234",
+                              "answers": [{"text": "Sky", "answer_start": 0},
+                                          {"text": "Sky", "answer_start": 0},
+                                          {"text": "Sky", "answer_start": 0}]
+                              }]}]}]}
+    self._val_input_path = os.path.join(self.get_temp_dir(), "val_data.json")
+    with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
+      writer.write(json.dumps(val_data, indent=4) + "\n")
+    self._test_vocab = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with tf.io.gfile.GFile(self._test_vocab, "w") as writer:
+      writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")
+  def _get_validation_data_config(self, version_2_with_negative=False):
+    """Builds the validation data config pointing at the test fixtures.
+
+    Args:
+      version_2_with_negative: Forwarded to the config unchanged.
+
+    Returns:
+      A `QADataConfig` with `is_training=False` and WordPiece tokenization.
+    """
+    return question_answering_dataloader.QADataConfig(
+        is_training=False,
+        input_path=self._val_input_path,
+        input_preprocessed_data_path=self.get_temp_dir(),
+        seq_length=128,
+        global_batch_size=1,
+        version_2_with_negative=version_2_with_negative,
+        vocab_file=self._test_vocab,
+        tokenization="WordPiece",
+        do_lower_case=True)
+  def _run_task(self, config):
+    """Runs one train step and one validation step, then checks the metrics.
+
+    Args:
+      config: The `QuestionAnsweringConfig` to build the task from.
+    """
+    task = question_answering.QuestionAnsweringTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    task.initialize(model)
+    train_dataset = task.build_inputs(config.train_data)
+    train_iterator = iter(train_dataset)
+    # `learning_rate` is the documented argument name; `lr` is only a
+    # deprecated alias.
+    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
+    task.train_step(next(train_iterator), model, optimizer, metrics=metrics)
+    val_dataset = task.build_inputs(config.validation_data)
+    val_iterator = iter(val_dataset)
+    logs = task.validation_step(next(val_iterator), model, metrics=metrics)
+    logs = task.aggregate_logs(step_outputs=logs)
+    metrics = task.reduce_aggregated_logs(logs)
+    self.assertIn("final_f1", metrics)
+  @parameterized.parameters(itertools.product(
+      (False, True),
+      ("WordPiece", "SentencePiece"),
+  ))
+  def test_task(self, version_2_with_negative, tokenization):
+    """Runs the task after initializing from a saved pretrainer checkpoint."""
+    # NOTE(review): `tokenization` is never used in this body, and
+    # `_get_validation_data_config` always sets "WordPiece", so the
+    # "SentencePiece" cases repeat the WordPiece ones - confirm intent.
+    # Saves a checkpoint.
+    pretrain_cfg = bert.BertPretrainerConfig(
+        encoder=self._encoder_config,
+        cls_heads=[
+            bert.ClsHeadConfig(
+                inner_dim=10, num_classes=3, name="next_sentence")
+        ])
+    pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
+    ckpt = tf.train.Checkpoint(
+        model=pretrain_model, **pretrain_model.checkpoint_items)
+    saved_path = ckpt.save(self.get_temp_dir())
+    config = question_answering.QuestionAnsweringConfig(
+        init_checkpoint=saved_path,
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        validation_data=self._get_validation_data_config(
+            version_2_with_negative))
+    self._run_task(config)
+  def test_task_with_fit(self):
+    """Checks that the compiled model trains with `fit` and logs metrics."""
+    config = question_answering.QuestionAnsweringConfig(
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        validation_data=self._get_validation_data_config())
+    task = question_answering.QuestionAnsweringTask(config)
+    model = task.build_model()
+    # `learning_rate` is the documented argument name; `lr` is only a
+    # deprecated alias.
+    model = task.compile_model(
+        model,
+        optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
+        train_step=task.train_step,
+        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
+    dataset = task.build_inputs(config.train_data)
+    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
+    self.assertIn("loss", logs.history)
+    self.assertIn("start_positions_accuracy", logs.history)
+    self.assertIn("end_positions_accuracy", logs.history)
+  def _export_bert_tfhub(self):
+    """Exports a tiny BERT encoder as a TF-Hub module for testing.
+
+    Returns:
+      The directory the hub module was exported to.
+    """
+    bert_config = configs.BertConfig(
+        vocab_size=30522,
+        hidden_size=16,
+        intermediate_size=32,
+        max_position_embeddings=128,
+        num_attention_heads=2,
+        num_hidden_layers=1)
+    _, encoder = export_tfhub.create_bert_model(bert_config)
+    # Save the encoder so the export below has a checkpoint to read.
+    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
+    checkpoint = tf.train.Checkpoint(model=encoder)
+    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
+    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
+    # The vocab file only needs to exist; its content is a placeholder.
+    vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
+    with tf.io.gfile.GFile(vocab_file, "w") as f:
+      f.write("dummy content")
+    hub_destination = os.path.join(self.get_temp_dir(), "hub")
+    export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
+                                   hub_destination, vocab_file)
+    return hub_destination
+  def test_task_with_hub(self):
+    """Runs the task with the encoder loaded from an exported hub module."""
+    hub_module_url = self._export_bert_tfhub()
+    config = question_answering.QuestionAnsweringConfig(
+        hub_module_url=hub_module_url,
+        model=question_answering.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        validation_data=self._get_validation_data_config())
+    self._run_task(config)
+if __name__ == "__main__":
+  tf.test.main()
--- a/official/nlp/tasks/sentence_prediction.py
+++ b/official/nlp/tasks/sentence_prediction.py
@@ -14,16 +14,38 @@
 # limitations under the License.
 # ==============================================================================
 """Sentence prediction (classification) task."""
-import logging
+from typing import List, Union
+from absl import logging
 import dataclasses
+import numpy as np
+import orbit
+from scipy import stats
+from sklearn import metrics as sklearn_metrics
 import tensorflow as tf
 import tensorflow_hub as hub
 from official.core import base_task
+from official.core import task_factory
+from official.modeling.hyperparams import base_config
 from official.modeling.hyperparams import config_definitions as cfg
-from official.nlp.configs import bert
+from official.nlp.configs import encoders
-from official.nlp.data import sentence_prediction_dataloader
+from official.nlp.data import data_loader_factory
-from official.nlp.modeling import losses as loss_lib
+from official.nlp.modeling import models
+from official.nlp.tasks import utils
+METRIC_TYPES = frozenset(
+    ['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr'])
+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """A classifier/regressor configuration.
+
+  Attributes:
+    num_classes: Number of output classes; the task treats a value of 1 as
+      regression.
+    use_encoder_pooler: Forwarded to `models.BertClassifier`.
+    encoder: Configuration of the transformer encoder.
+  """
+  num_classes: int = 0
+  use_encoder_pooler: bool = False
+  # Build the default through a factory so every config owns its encoder
+  # config. A dataclass instance used directly as a default is shared between
+  # all configs and is rejected as a mutable default on Python 3.11+.
+  encoder: encoders.TransformerEncoderConfig = dataclasses.field(
+      default_factory=encoders.TransformerEncoderConfig)
 @dataclasses.dataclass
@@ -32,62 +54,58 @@ class SentencePredictionConfig(cfg.TaskConfig):
  # At most one of `init_checkpoint` and `hub_module_url` can
  # be specified.
  init_checkpoint: str = ''
+  init_cls_pooler: bool = False
  hub_module_url: str = ''
-  network: bert.BertPretrainerConfig = bert.BertPretrainerConfig(
+  metric_type: str = 'accuracy'
-      num_masked_tokens=0,
+  # Defines the concrete model config at instantiation time.
-      cls_heads=[
+  model: ModelConfig = ModelConfig()
-          bert.ClsHeadConfig(
-              inner_dim=768,
-              num_classes=3,
-              dropout_rate=0.1,
-              name='sentence_prediction')
-      ])
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()
-@base_task.register_task_cls(SentencePredictionConfig)
+@task_factory.register_task_cls(SentencePredictionConfig)
 class SentencePredictionTask(base_task.Task):
  """Task object for sentence_prediction."""
-  def __init__(self, params=cfg.TaskConfig):
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
-    super(SentencePredictionTask, self).__init__(params)
+    super(SentencePredictionTask, self).__init__(params, logging_dir)
    if params.hub_module_url and params.init_checkpoint:
      raise ValueError('At most one of `hub_module_url` and '
-                       '`pretrain_checkpoint_dir` can be specified.')
+                       '`init_checkpoint` can be specified.')
    if params.hub_module_url:
      self._hub_module = hub.load(params.hub_module_url)
    else:
      self._hub_module = None
+    if params.metric_type not in METRIC_TYPES:
+      raise ValueError('Invalid metric_type: {}'.format(params.metric_type))
+    self.metric_type = params.metric_type
  def build_model(self):
    if self._hub_module:
-      input_word_ids = tf.keras.layers.Input(
+      encoder_network = utils.get_encoder_from_hub(self._hub_module)
-          shape=(None,), dtype=tf.int32, name='input_word_ids')
-      input_mask = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_mask')
-      input_type_ids = tf.keras.layers.Input(
-          shape=(None,), dtype=tf.int32, name='input_type_ids')
-      bert_model = hub.KerasLayer(self._hub_module, trainable=True)
-      pooled_output, sequence_output = bert_model(
-          [input_word_ids, input_mask, input_type_ids])
-      encoder_from_hub = tf.keras.Model(
-          inputs=[input_word_ids, input_mask, input_type_ids],
-          outputs=[sequence_output, pooled_output])
-      return bert.instantiate_from_cfg(
-          self.task_config.network, encoder_network=encoder_from_hub)
    else:
-      return bert.instantiate_from_cfg(self.task_config.network)
+      encoder_network = encoders.instantiate_encoder_from_cfg(
+          self.task_config.model.encoder)
+    # Currently, we only support bert-style sentence prediction finetuning.
+    return models.BertClassifier(
+        network=encoder_network,
+        num_classes=self.task_config.model.num_classes,
+        initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=self.task_config.model.encoder.initializer_range),
+        use_encoder_pooler=self.task_config.model.use_encoder_pooler)
  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
-    loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
+    if self.task_config.model.num_classes == 1:
-        labels=labels,
+      loss = tf.keras.losses.mean_squared_error(labels, model_outputs)
-        predictions=tf.nn.log_softmax(
+    else:
-            model_outputs['sentence_prediction'], axis=-1))
+      loss = tf.keras.losses.sparse_categorical_crossentropy(
+          labels, tf.cast(model_outputs, tf.float32), from_logits=True)
    if aux_losses:
      loss += tf.add_n(aux_losses)
-    return loss
+    return tf.reduce_mean(loss)
  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for sentence_prediction task."""
@@ -99,8 +117,12 @@ class SentencePredictionTask(base_task.Task):
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)
-        y = tf.ones((1, 1), dtype=tf.int32)
-        return (x, y)
+        if self.task_config.model.num_classes == 1:
+          y = tf.zeros((1,), dtype=tf.float32)
+        else:
+          y = tf.zeros((1, 1), dtype=tf.int32)
+        return x, y
      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
@@ -108,20 +130,80 @@ class SentencePredictionTask(base_task.Task):
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset
-    return sentence_prediction_dataloader.SentencePredictionDataLoader(
+    return data_loader_factory.get_data_loader(params).load(input_context)
-        params).load(input_context)
  def build_metrics(self, training=None):
    del training
-    metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
+    if self.task_config.model.num_classes == 1:
+      metrics = [tf.keras.metrics.MeanSquaredError()]
+    else:
+      metrics = [
+          tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
    return metrics
  def process_metrics(self, metrics, labels, model_outputs):
    for metric in metrics:
-      metric.update_state(labels, model_outputs['sentence_prediction'])
+      metric.update_state(labels, model_outputs)
  def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
-    compiled_metrics.update_state(labels, model_outputs['sentence_prediction'])
+    compiled_metrics.update_state(labels, model_outputs)
+  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
+    """Runs one validation step, collecting predictions when needed.
+
+    For the 'accuracy' metric type the base class implementation is used.
+    For the other metric types the predictions and labels are returned in
+    the logs so they can be aggregated over the whole dataset.
+
+    Args:
+      inputs: A `(features, labels)` pair.
+      model: The model to evaluate.
+      metrics: Only forwarded to the base class for the 'accuracy' type.
+
+    Returns:
+      The base class result for 'accuracy'; otherwise a dict with the loss
+      under `self.loss` plus 'sentence_prediction' and 'labels'.
+    """
+    if self.metric_type == 'accuracy':
+      return super(SentencePredictionTask,
+                   self).validation_step(inputs, model, metrics)
+    features, labels = inputs
+    outputs = self.inference_step(features, model)
+    loss = self.build_losses(
+        labels=labels, model_outputs=outputs, aux_losses=model.losses)
+    logs = {self.loss: loss}
+    if self.metric_type == 'matthews_corrcoef':
+      # The predicted class ids (argmax over axis 1) get a new leading axis.
+      logs.update({
+          'sentence_prediction':
+              tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=0),
+          'labels':
+              labels,
+      })
+    if self.metric_type == 'pearson_spearman_corr':
+      # The raw model outputs are kept as the predictions.
+      logs.update({
+          'sentence_prediction': outputs,
+          'labels': labels,
+      })
+    return logs
+  def aggregate_logs(self, state=None, step_outputs=None):
+    """Appends one step's predictions and labels to the running state.
+
+    Args:
+      state: Dict with 'sentence_prediction' and 'labels' lists, or None to
+        start a new one.
+      step_outputs: The logs returned by `validation_step`.
+
+    Returns:
+      None for the 'accuracy' metric type, otherwise the updated state.
+    """
+    if self.metric_type == 'accuracy':
+      return None
+    if state is None:
+      state = {'sentence_prediction': [], 'labels': []}
+    # TODO(b/160712818): Add support for concatenating partial batches.
+    # Each entry is iterated and its elements concatenated along axis 0;
+    # presumably one element per replica - TODO confirm against the caller.
+    state['sentence_prediction'].append(
+        np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']],
+                       axis=0))
+    state['labels'].append(
+        np.concatenate([v.numpy() for v in step_outputs['labels']], axis=0))
+    return state
+  def reduce_aggregated_logs(self, aggregated_logs):
+    """Computes the dataset-level metric from the aggregated state.
+
+    Args:
+      aggregated_logs: The state dict built by `aggregate_logs`.
+
+    Returns:
+      None for the 'accuracy' metric type; otherwise a dict mapping
+      `self.metric_type` to the metric value. For 'pearson_spearman_corr'
+      the value is the mean of the Pearson and Spearman correlations.
+    """
+    if self.metric_type == 'accuracy':
+      return None
+    elif self.metric_type == 'matthews_corrcoef':
+      # Flatten all collected predictions and labels into 1-D arrays.
+      preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
+      preds = np.reshape(preds, -1)
+      labels = np.concatenate(aggregated_logs['labels'], axis=0)
+      labels = np.reshape(labels, -1)
+      return {
+          self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels)
+      }
+    elif self.metric_type == 'pearson_spearman_corr':
+      preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
+      preds = np.reshape(preds, -1)
+      labels = np.concatenate(aggregated_logs['labels'], axis=0)
+      labels = np.reshape(labels, -1)
+      # Element [0] of each result is used as the correlation value.
+      pearson_corr = stats.pearsonr(preds, labels)[0]
+      spearman_corr = stats.spearmanr(preds, labels)[0]
+      corr_metric = (pearson_corr + spearman_corr) / 2
+      return {self.metric_type: corr_metric}
  def initialize(self, model):
    """Load a pretrained checkpoint (if exists) and then train from iter 0."""
@@ -132,13 +214,65 @@ class SentencePredictionTask(base_task.Task):
      return
    pretrain2finetune_mapping = {
-        'encoder':
+        'encoder': model.checkpoint_items['encoder'],
-            model.checkpoint_items['encoder'],
-        'next_sentence.pooler_dense':
-            model.checkpoint_items['sentence_prediction.pooler_dense'],
    }
+    # TODO(b/160251903): Investigate why no pooler dense improves finetuning
+    # accuracies.
+    if self.task_config.init_cls_pooler:
+      pretrain2finetune_mapping[
+          'next_sentence.pooler_dense'] = model.checkpoint_items[
+              'sentence_prediction.pooler_dense']
    ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping)
-    status = ckpt.restore(ckpt_dir_or_file)
+    status = ckpt.read(ckpt_dir_or_file)
    status.expect_partial().assert_existing_objects_matched()
-    logging.info('finished loading pretrained checkpoint from %s',
+    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)
+def predict(task: SentencePredictionTask, params: cfg.DataConfig,
+            model: tf.keras.Model) -> List[Union[int, float]]:
+  """Predicts on the input data.
+
+  Inference runs under the current `tf.distribute` strategy and the
+  per-replica results are collected into a single list.
+
+  Args:
+    task: A `SentencePredictionTask` object.
+    params: A `cfg.DataConfig` object.
+    model: A keras.Model.
+  Returns:
+    A list of predictions with length of `num_examples`. For regression task,
+      each element in the list is the predicted score; for classification task,
+      each element is the predicted class id.
+  """
+  # A single output class means the task is treated as regression.
+  is_regression = task.task_config.model.num_classes == 1
+  @tf.function
+  def predict_step(iterator):
+    """Predicts on distributed devices."""
+    def _replicated_step(inputs):
+      """Replicated prediction calculation."""
+      x, _ = inputs
+      outputs = task.inference_step(x, model)
+      if is_regression:
+        return outputs
+      else:
+        return tf.argmax(outputs, axis=-1)
+    outputs = tf.distribute.get_strategy().run(
+        _replicated_step, args=(next(iterator),))
+    return tf.nest.map_structure(
+        tf.distribute.get_strategy().experimental_local_results, outputs)
+  def reduce_fn(state, outputs):
+    """Concatenates model's outputs."""
+    for per_replica_batch_predictions in outputs:
+      state.extend(per_replica_batch_predictions)
+    return state
+  loop_fn = orbit.utils.create_loop_fn(predict_step)
+  dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
+                                                 task.build_inputs, params)
+  # Set `num_steps` to -1 to exhaust the dataset.
+  predictions = loop_fn(
+      iter(dataset), num_steps=-1, state=[], reduce_fn=reduce_fn)
+  return predictions
--- a/official/nlp/tasks/sentence_prediction_test.py
+++ b/official/nlp/tasks/sentence_prediction_test.py
@@ -16,16 +16,61 @@
 """Tests for official.nlp.tasks.sentence_prediction."""
 import functools
 import os
+from absl.testing import parameterized
+import numpy as np
 import tensorflow as tf
 from official.nlp.bert import configs
 from official.nlp.bert import export_tfhub
 from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import sentence_prediction_dataloader
 from official.nlp.tasks import sentence_prediction
-class SentencePredictionTaskTest(tf.test.TestCase):
+def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
+  """Writes a TFRecord file of randomly generated examples.
+  Args:
+    output_path: Path of the TFRecord file to write.
+    seq_length: Number of values written for each of `input_ids`,
+      `input_mask` and `segment_ids`.
+    num_classes: When 1, `label_ids` is a single random float in [0, 1);
+      otherwise it is a single random int in [0, num_classes - 1].
+    num_examples: Number of records to write.
+  """
+  def create_int_feature(values):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+  def create_float_feature(values):
+    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+  # The context manager closes the writer even if building a record raises.
+  with tf.io.TFRecordWriter(output_path) as writer:
+    for _ in range(num_examples):
+      input_ids = np.random.randint(100, size=(seq_length))
+      features = {
+          "input_ids": create_int_feature(input_ids),
+          "input_mask": create_int_feature(np.ones_like(input_ids)),
+          "segment_ids": create_int_feature(np.ones_like(input_ids)),
+      }
+      if num_classes == 1:
+        features["label_ids"] = create_float_feature([np.random.random()])
+      else:
+        # `randint` excludes its upper bound, so this draws from
+        # [0, num_classes - 1] like the deprecated `random_integers` did.
+        features["label_ids"] = create_int_feature(
+            [np.random.randint(0, num_classes)])
+      tf_example = tf.train.Example(
+          features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
+  def setUp(self):
+    """Creates the dummy-input training data config shared by the tests."""
+    super().setUp()
+    config_cls = sentence_prediction_dataloader.SentencePredictionDataConfig
+    self._train_data_config = config_cls(
+        input_path="dummy", seq_length=128, global_batch_size=1)
+  def get_model_config(self, num_classes):
+    """Returns a `ModelConfig` with a one-layer encoder and `num_classes`."""
+    encoder_config = encoders.TransformerEncoderConfig(
+        vocab_size=30522, num_layers=1)
+    return sentence_prediction.ModelConfig(
+        encoder=encoder_config, num_classes=num_classes)
  def _run_task(self, config):
    task = sentence_prediction.SentencePredictionTask(config)
@@ -44,16 +89,8 @@ class SentencePredictionTaskTest(tf.test.TestCase):
  def test_task(self):
    config = sentence_prediction.SentencePredictionConfig(
        init_checkpoint=self.get_temp_dir(),
-        network=bert.BertPretrainerConfig(
+        model=self.get_model_config(2),
-            encoder=encoders.TransformerEncoderConfig(
+        train_data=self._train_data_config)
-                vocab_size=30522, num_layers=1),
-            num_masked_tokens=0,
-            cls_heads=[
-                bert.ClsHeadConfig(
-                    inner_dim=10, num_classes=3, name="sentence_prediction")
-            ]),
-        train_data=bert.BertSentencePredictionDataConfig(
-            input_path="dummy", seq_length=128, global_batch_size=1))
    task = sentence_prediction.SentencePredictionTask(config)
    model = task.build_model()
    metrics = task.build_metrics()
@@ -68,17 +105,89 @@ class SentencePredictionTaskTest(tf.test.TestCase):
    pretrain_cfg = bert.BertPretrainerConfig(
        encoder=encoders.TransformerEncoderConfig(
            vocab_size=30522, num_layers=1),
-        num_masked_tokens=20,
        cls_heads=[
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=3, name="next_sentence")
        ])
-    pretrain_model = bert.instantiate_from_cfg(pretrain_cfg)
+    pretrain_model = bert.instantiate_pretrainer_from_cfg(pretrain_cfg)
    ckpt = tf.train.Checkpoint(
        model=pretrain_model, **pretrain_model.checkpoint_items)
    ckpt.save(config.init_checkpoint)
    task.initialize(model)
+  @parameterized.named_parameters(
+      {
+          "testcase_name": "regression",
+          "num_classes": 1,
+      },
+      {
+          "testcase_name": "classification",
+          "num_classes": 2,
+      },
+  )
+  def test_metrics_and_losses(self, num_classes):
+    """Checks the metric type and the validation loss for both task modes."""
+    config = sentence_prediction.SentencePredictionConfig(
+        init_checkpoint=self.get_temp_dir(),
+        model=self.get_model_config(num_classes),
+        train_data=self._train_data_config)
+    task = sentence_prediction.SentencePredictionTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    # One output class is expected to select the regression metric.
+    if num_classes == 1:
+      self.assertIsInstance(metrics[0], tf.keras.metrics.MeanSquaredError)
+    else:
+      self.assertIsInstance(
+          metrics[0], tf.keras.metrics.SparseCategoricalAccuracy)
+    dataset = task.build_inputs(config.train_data)
+    iterator = iter(dataset)
+    # `learning_rate` is the documented argument name; `lr` is only a legacy
+    # alias that newer Keras releases no longer accept.
+    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    logs = task.validation_step(next(iterator), model, metrics=metrics)
+    loss = logs["loss"].numpy()
+    # NOTE(review): these expected values are hard-coded; confirm they do not
+    # depend on random weight initialization.
+    if num_classes == 1:
+      self.assertAlmostEqual(loss, 42.77483, places=3)
+    else:
+      self.assertAlmostEqual(loss, 3.57627e-6, places=3)
+  @parameterized.parameters(("matthews_corrcoef", 2),
+                            ("pearson_spearman_corr", 1))
+  def test_np_metrics(self, metric_type, num_classes):
+    """Checks that the configured metric appears in the reduced eval logs."""
+    task_config = sentence_prediction.SentencePredictionConfig(
+        metric_type=metric_type,
+        init_checkpoint=self.get_temp_dir(),
+        model=self.get_model_config(num_classes),
+        train_data=self._train_data_config)
+    task = sentence_prediction.SentencePredictionTask(task_config)
+    model = task.build_model()
+    batch = next(iter(task.build_inputs(task_config.train_data)))
+    strategy = tf.distribute.get_strategy()
+    validation_fn = functools.partial(task.validation_step, model=model)
+    per_replica_logs = strategy.run(validation_fn, args=(batch,))
+    local_logs = tf.nest.map_structure(strategy.experimental_local_results,
+                                       per_replica_logs)
+    # Aggregate the same step twice: once without and once with prior state.
+    state = task.aggregate_logs(step_outputs=local_logs)
+    state = task.aggregate_logs(state=state, step_outputs=local_logs)
+    self.assertIn(metric_type, task.reduce_aggregated_logs(state))
+  def test_task_with_fit(self):
+    """Checks that a compiled task model trains through `model.fit`."""
+    config = sentence_prediction.SentencePredictionConfig(
+        model=self.get_model_config(2), train_data=self._train_data_config)
+    task = sentence_prediction.SentencePredictionTask(config)
+    model = task.build_model()
+    # `learning_rate` is the documented argument name; `lr` is only a legacy
+    # alias that newer Keras releases no longer accept.
+    model = task.compile_model(
+        model,
+        optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
+        train_step=task.train_step,
+        metrics=task.build_metrics())
+    dataset = task.build_inputs(config.train_data)
+    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
+    self.assertIn("loss", logs.history)
  def _export_bert_tfhub(self):
    bert_config = configs.BertConfig(
        vocab_size=30522,
@@ -106,17 +215,39 @@ class SentencePredictionTaskTest(tf.test.TestCase):
    hub_module_url = self._export_bert_tfhub()
    config = sentence_prediction.SentencePredictionConfig(
        hub_module_url=hub_module_url,
-        network=bert.BertPretrainerConfig(
+        model=self.get_model_config(2),
-            encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
+        train_data=self._train_data_config)
-            num_masked_tokens=0,
-            cls_heads=[
-                bert.ClsHeadConfig(
-                    inner_dim=10, num_classes=3, name="sentence_prediction")
-            ]),
-        train_data=bert.BertSentencePredictionDataConfig(
-            input_path="dummy", seq_length=128, global_batch_size=10))
    self._run_task(config)
+  @parameterized.named_parameters(("classification", 5), ("regression", 1))
+  def test_prediction(self, num_classes):
+    """Checks that `predict` returns one prediction per written example."""
+    seq_length = 16
+    num_examples = 100
+    task = sentence_prediction.SentencePredictionTask(
+        sentence_prediction.SentencePredictionConfig(
+            model=self.get_model_config(num_classes=num_classes),
+            train_data=self._train_data_config))
+    model = task.build_model()
+    record_path = os.path.join(self.get_temp_dir(), "test.tf_record")
+    _create_fake_dataset(
+        record_path,
+        seq_length=seq_length,
+        num_classes=num_classes,
+        num_examples=num_examples)
+    # The label type must match what `_create_fake_dataset` wrote: ints for
+    # classification, a float for regression.
+    label_type = "int" if num_classes > 1 else "float"
+    config_cls = sentence_prediction_dataloader.SentencePredictionDataConfig
+    test_data_config = config_cls(
+        input_path=record_path,
+        seq_length=seq_length,
+        is_training=False,
+        label_type=label_type,
+        global_batch_size=16,
+        drop_remainder=False)
+    predictions = sentence_prediction.predict(task, test_data_config, model)
+    self.assertLen(predictions, num_examples)
 if __name__ == "__main__":
  tf.test.main()
--- a/official/nlp/tasks/tagging.py
+++ b/official/nlp/tasks/tagging.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tagging (e.g., NER/POS) task."""
+from typing import List, Optional, Tuple
+import dataclasses
+import orbit
+from seqeval import metrics as seqeval_metrics
+import tensorflow as tf
+import tensorflow_hub as hub
+from official.core import base_task
+from official.core import task_factory
+from official.modeling.hyperparams import base_config
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.configs import encoders
+from official.nlp.data import data_loader_factory
+from official.nlp.modeling import models
+from official.nlp.tasks import utils
+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """Model configuration for the tagging task."""
+  # Configuration of the encoder network that `TaggingTask.build_model`
+  # instantiates when no TF-Hub module is given.
+  encoder: encoders.TransformerEncoderConfig = (
+      encoders.TransformerEncoderConfig())
+  # Passed as `dropout_rate` to the token classifier built by `TaggingTask`.
+  head_dropout: float = 0.1
+  # Standard deviation of the truncated-normal initializer handed to the
+  # token classifier built by `TaggingTask`.
+  head_initializer_range: float = 0.02
+@dataclasses.dataclass
+class TaggingConfig(cfg.TaskConfig):
+  """The tagging task config."""
+  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
+  init_checkpoint: str = ''
+  hub_module_url: str = ''
+  model: ModelConfig = ModelConfig()
+  # The real class names, the order of which should match real label id.
+  # Note that a word may be tokenized into multiple word_pieces tokens, and
+  # we assume the real label id (non-negative) is assigned to the first token
+  # of the word, and a negative label id is assigned to the remaining tokens.
+  # The negative label id will not contribute to loss and metrics.
+  # `TaggingTask` raises a ValueError when this is left empty.
+  class_names: Optional[List[str]] = None
+  train_data: cfg.DataConfig = cfg.DataConfig()
+  validation_data: cfg.DataConfig = cfg.DataConfig()
+def _masked_labels_and_weights(y_true):
+  """Splits token labels into loss-safe labels and a 0/1 weight mask.
+  Args:
+    y_true: Token labels, typically shape (batch_size, seq_len), where tokens
+      with negative labels should be ignored during loss/accuracy calculation.
+  Returns:
+    A `(masked_y_true, masked_weights)` tuple. `masked_y_true` equals `y_true`
+    with every negative label replaced by zero; `masked_weights` is a float32
+    tensor that is 1.0 where the label was non-negative and 0.0 elsewhere.
+  """
+  # True wherever the label is a real (non-negative) class id.
+  is_real_label = tf.greater_equal(y_true, 0)
+  # Negative ids are out of bounds for some loss functions, so substitute a
+  # valid id (zero); the weights below cancel their contribution.
+  safe_labels = tf.where(is_real_label, y_true, 0)
+  label_weights = tf.cast(is_real_label, tf.float32)
+  return safe_labels, label_weights
+@task_factory.register_task_cls(TaggingConfig)
+class TaggingTask(base_task.Task):
+  """Task for token-level tagging (e.g., NER or POS)."""
+  def __init__(self, params=cfg.TaskConfig, logging_dir=None):
+    """Validates the config and loads the TF-Hub module if one is given.
+    Args:
+      params: The task config; `hub_module_url`, `init_checkpoint` and
+        `class_names` are read from it here.
+      logging_dir: Forwarded unchanged to the base class constructor.
+    Raises:
+      ValueError: If both `hub_module_url` and `init_checkpoint` are set, or
+        if `class_names` is empty.
+    """
+    # NOTE(review): the default for `params` is the `cfg.TaskConfig` class
+    # itself, which looks like a type annotation written as a default value.
+    # Confirm before relying on calling this without `params`.
+    super().__init__(params, logging_dir)
+    if params.hub_module_url and params.init_checkpoint:
+      raise ValueError('At most one of `hub_module_url` and '
+                       '`init_checkpoint` can be specified.')
+    if not params.class_names:
+      raise ValueError('TaggingConfig.class_names cannot be empty.')
+    hub_url = params.hub_module_url
+    self._hub_module = hub.load(hub_url) if hub_url else None
+  def build_model(self):
+    """Builds a `BertTokenClassifier` configured with `output='logits'`."""
+    model_config = self.task_config.model
+    # Prefer the TF-Hub encoder when one was loaded in the constructor.
+    if self._hub_module:
+      encoder = utils.get_encoder_from_hub(self._hub_module)
+    else:
+      encoder = encoders.instantiate_encoder_from_cfg(model_config.encoder)
+    head_initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=model_config.head_initializer_range)
+    return models.BertTokenClassifier(
+        network=encoder,
+        num_classes=len(self.task_config.class_names),
+        initializer=head_initializer,
+        dropout_rate=model_config.head_dropout,
+        output='logits')
+  def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
+    """Computes the mean cross-entropy over tokens with non-negative labels.
+    Args:
+      labels: Token label ids; negative ids are excluded from the loss.
+      model_outputs: Logits from the model; cast to float32 before use.
+      aux_losses: Not used by this implementation.
+    Returns:
+      A scalar loss tensor.
+    """
+    logits = tf.cast(model_outputs, tf.float32)
+    safe_labels, token_weights = _masked_labels_and_weights(labels)
+    per_token_loss = tf.keras.losses.sparse_categorical_crossentropy(
+        safe_labels, logits, from_logits=True)
+    weighted_sum = tf.reduce_sum(per_token_loss * token_weights)
+    weight_total = tf.reduce_sum(token_weights)
+    # `divide_no_nan` returns 0 when no token carries a real label.
+    return tf.math.divide_no_nan(weighted_sum, weight_total)
+  def build_inputs(self, params: cfg.DataConfig, input_context=None):
+    """Returns the tagging dataset, or synthetic data for path 'dummy'."""
+    if params.input_path != 'dummy':
+      return data_loader_factory.get_data_loader(params).load(input_context)
+    def dummy_data(_):
+      """Builds one all-zero feature dict with random labels."""
+      zero_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
+      features = dict(
+          input_word_ids=zero_ids,
+          input_mask=zero_ids,
+          input_type_ids=zero_ids)
+      # `minval=-1` makes some label ids -1, which loss/metrics ignore.
+      labels = tf.random.uniform(
+          shape=(1, params.seq_length),
+          minval=-1,
+          maxval=len(self.task_config.class_names),
+          dtype=tf.dtypes.int32)
+      return (features, labels)
+    # An endlessly repeated single element, mapped to the synthetic example.
+    endless = tf.data.Dataset.range(1).repeat()
+    return endless.map(
+        dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  def inference_step(self, inputs, model: tf.keras.Model):
+    """Runs the model in inference mode and returns logits and argmax ids."""
+    logits = model(inputs, training=False)
+    predict_ids = tf.argmax(logits, axis=-1)
+    return {'logits': logits, 'predict_ids': predict_ids}
+  def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
+    """Validation step.
+    Args:
+      inputs: a `(features, labels)` pair of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects; not used here.
+    Returns:
+      A dictionary of logs holding the loss plus the predicted and true label
+      ids at the positions whose label is non-negative.
+    """
+    features, labels = inputs
+    outputs = self.inference_step(features, model)
+    loss = self.build_losses(labels=labels, model_outputs=outputs['logits'])
+    # Negative label ids are padding labels which should be ignored.
+    real_positions = tf.where(tf.greater_equal(labels, 0))
+    logs = {self.loss: loss}
+    logs['predict_ids'] = tf.gather_nd(outputs['predict_ids'], real_positions)
+    logs['label_ids'] = tf.gather_nd(labels, real_positions)
+    return logs
+  def aggregate_logs(self, state=None, step_outputs=None):
+    """Accumulates class names from one validation step into `state`.
+    Args:
+      state: The dict returned by a previous call, or None to start fresh.
+      step_outputs: Logs of one validation step, with `predict_ids` and
+        `label_ids` entries.
+    Returns:
+      The updated state dict with `predict_class` and `label_class` lists.
+    """
+    if state is None:
+      state = {'predict_class': [], 'label_class': []}
+    names = self.task_config.class_names
+    def id_to_class_name(batched_ids):
+      """Maps every id in every id sequence to its class name."""
+      return [[names[token_id]
+               for token_id in example_ids.numpy().tolist()]
+              for example_ids in batched_ids]
+    # Convert id to class names, because `seqeval_metrics` relies on the class
+    # name to decide IOB tags.
+    state['predict_class'].extend(id_to_class_name(step_outputs['predict_ids']))
+    state['label_class'].extend(id_to_class_name(step_outputs['label_ids']))
+    return state
+  def reduce_aggregated_logs(self, aggregated_logs):
+    """Computes seqeval f1/precision/recall/accuracy from aggregated logs."""
+    y_true = aggregated_logs['label_class']
+    y_pred = aggregated_logs['predict_class']
+    scorers = (
+        ('f1', seqeval_metrics.f1_score),
+        ('precision', seqeval_metrics.precision_score),
+        ('recall', seqeval_metrics.recall_score),
+        ('accuracy', seqeval_metrics.accuracy_score),
+    )
+    return {name: scorer(y_true, y_pred) for name, scorer in scorers}
+def predict(task: TaggingTask, params: cfg.DataConfig,
+            model: tf.keras.Model) -> Tuple[List[List[int]], List[int]]:
+  """Runs `model` over the dataset and collects per-example predictions.
+  Args:
+    task: A `TaggingTask` object.
+    params: A `cfg.DataConfig` object describing the data to predict on.
+    model: A keras.Model.
+  Returns:
+    A tuple of `predict_ids` and `sentence_ids`, two lists with one entry per
+      example. Each element in `predict_ids` is the sequence of predicted
+      label ids at the positions whose label is non-negative, and each element
+      in `sentence_ids` is the sentence id of the corresponding example.
+  """
+  @tf.function
+  def predict_step(iterator):
+    """Runs one batch on every replica and returns the local results."""
+    def _replicated_step(inputs):
+      """Computes predictions, label mask and sentence ids on one replica."""
+      features, labels = inputs
+      # `sentence_id` is removed from the features before they reach the model.
+      sentence_ids = features.pop('sentence_id')
+      predict_ids = task.inference_step(features, model)['predict_ids']
+      return dict(
+          predict_ids=predict_ids,
+          label_mask=tf.greater_equal(labels, 0),
+          sentence_ids=sentence_ids)
+    per_replica = tf.distribute.get_strategy().run(
+        _replicated_step, args=(next(iterator),))
+    return tf.nest.map_structure(
+        tf.distribute.get_strategy().experimental_local_results, per_replica)
+  def reduce_fn(state, outputs):
+    """Appends this step's unpadded predictions and sentence ids to `state`."""
+    all_predict_ids, all_sentence_ids = state
+    per_replica = zip(outputs['predict_ids'], outputs['label_mask'],
+                      outputs['sentence_ids'])
+    for replica_ids, replica_mask, replica_sentence_ids in per_replica:
+      per_example = zip(replica_ids.numpy(), replica_mask.numpy(),
+                        replica_sentence_ids.numpy())
+      for example_ids, example_mask, sentence_id in per_example:
+        all_sentence_ids.append(sentence_id)
+        kept_ids = []
+        all_predict_ids.append(kept_ids)
+        assert len(example_ids) == len(example_mask)
+        # Skip the padding label: keep only positions whose mask is set.
+        kept_ids.extend(
+            token_id
+            for token_id, keep in zip(example_ids, example_mask)
+            if keep)
+    return all_predict_ids, all_sentence_ids
+  distributed_dataset = orbit.utils.make_distributed_dataset(
+      tf.distribute.get_strategy(), task.build_inputs, params)
+  run_loop = orbit.utils.create_loop_fn(predict_step)
+  # `num_steps=-1` keeps looping until the dataset is exhausted.
+  predict_ids, sentence_ids = run_loop(
+      iter(distributed_dataset),
+      num_steps=-1,
+      state=([], []),
+      reduce_fn=reduce_fn)
+  return predict_ids, sentence_ids
--- a/official/nlp/tasks/tagging_test.py
+++ b/official/nlp/tasks/tagging_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for official.nlp.tasks.tagging."""
+import functools
+import os
+import numpy as np
+import tensorflow as tf
+from official.nlp.bert import configs
+from official.nlp.bert import export_tfhub
+from official.nlp.configs import encoders
+from official.nlp.data import tagging_data_loader
+from official.nlp.tasks import tagging
+def _create_fake_dataset(output_path, seq_length, num_labels, num_examples):
+  """Writes a TFRecord file of randomly generated tagging examples.
+  Args:
+    output_path: Path of the TFRecord file to write.
+    seq_length: Number of values written for each of `input_ids`,
+      `input_mask`, `segment_ids` and `label_ids`.
+    num_labels: Number of real labels; label ids are drawn from
+      [-1, num_labels - 1], so some positions get the negative id -1.
+    num_examples: Number of records to write; record `i` gets
+      `sentence_id` `i`.
+  """
+  def create_int_feature(values):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+  # The context manager closes the writer even if building a record raises.
+  with tf.io.TFRecordWriter(output_path) as writer:
+    for i in range(num_examples):
+      input_ids = np.random.randint(100, size=(seq_length))
+      # `randint` excludes its upper bound, so this draws from
+      # [-1, num_labels - 1] like the deprecated `random_integers` did.
+      label_ids = np.random.randint(-1, num_labels, size=(seq_length))
+      features = {
+          "input_ids": create_int_feature(input_ids),
+          "input_mask": create_int_feature(np.ones_like(input_ids)),
+          "segment_ids": create_int_feature(np.ones_like(input_ids)),
+          "label_ids": create_int_feature(label_ids),
+          "sentence_id": create_int_feature([i]),
+      }
+      tf_example = tf.train.Example(
+          features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+class TaggingTest(tf.test.TestCase):
+  """Tests for `tagging.TaggingTask` and `tagging.predict`."""
+  def setUp(self):
+    """Creates the encoder and dummy-input data configs shared by the tests."""
+    super().setUp()
+    self._encoder_config = encoders.TransformerEncoderConfig(
+        vocab_size=30522, num_layers=1)
+    self._train_data_config = tagging_data_loader.TaggingDataConfig(
+        input_path="dummy", seq_length=128, global_batch_size=1)
+  def _run_task(self, config):
+    """Runs one train step and one validation step on a distributed dataset."""
+    task = tagging.TaggingTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    strategy = tf.distribute.get_strategy()
+    dataset = strategy.experimental_distribute_datasets_from_function(
+        functools.partial(task.build_inputs, config.train_data))
+    iterator = iter(dataset)
+    # `learning_rate` is the documented argument name; `lr` is only a legacy
+    # alias that newer Keras releases no longer accept.
+    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    task.validation_step(next(iterator), model, metrics=metrics)
+  def test_task(self):
+    """Checks train/validation steps and initializing from a checkpoint."""
+    # Saves a checkpoint.
+    encoder = encoders.instantiate_encoder_from_cfg(self._encoder_config)
+    ckpt = tf.train.Checkpoint(encoder=encoder)
+    saved_path = ckpt.save(self.get_temp_dir())
+    config = tagging.TaggingConfig(
+        init_checkpoint=saved_path,
+        model=tagging.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        class_names=["O", "B-PER", "I-PER"])
+    task = tagging.TaggingTask(config)
+    model = task.build_model()
+    metrics = task.build_metrics()
+    dataset = task.build_inputs(config.train_data)
+    iterator = iter(dataset)
+    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
+    task.train_step(next(iterator), model, optimizer, metrics=metrics)
+    task.validation_step(next(iterator), model, metrics=metrics)
+    task.initialize(model)
+  def test_task_with_fit(self):
+    """Checks that a compiled task model trains through `model.fit`."""
+    config = tagging.TaggingConfig(
+        model=tagging.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        class_names=["O", "B-PER", "I-PER"])
+    task = tagging.TaggingTask(config)
+    model = task.build_model()
+    model = task.compile_model(
+        model,
+        optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
+        train_step=task.train_step,
+        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
+    dataset = task.build_inputs(config.train_data)
+    logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
+    self.assertIn("loss", logs.history)
+    self.assertIn("accuracy", logs.history)
+  def _export_bert_tfhub(self):
+    """Exports a tiny BERT encoder as a TF-Hub module and returns its path."""
+    bert_config = configs.BertConfig(
+        vocab_size=30522,
+        hidden_size=16,
+        intermediate_size=32,
+        max_position_embeddings=128,
+        num_attention_heads=2,
+        num_hidden_layers=1)
+    _, encoder = export_tfhub.create_bert_model(bert_config)
+    model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
+    checkpoint = tf.train.Checkpoint(model=encoder)
+    checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
+    model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
+    # The export needs a vocab file path; its content is a placeholder here.
+    vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
+    with tf.io.gfile.GFile(vocab_file, "w") as f:
+      f.write("dummy content")
+    hub_destination = os.path.join(self.get_temp_dir(), "hub")
+    export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
+                                   hub_destination, vocab_file)
+    return hub_destination
+  def test_task_with_hub(self):
+    """Checks that the task runs with an encoder loaded from TF-Hub."""
+    hub_module_url = self._export_bert_tfhub()
+    config = tagging.TaggingConfig(
+        hub_module_url=hub_module_url,
+        class_names=["O", "B-PER", "I-PER"],
+        train_data=self._train_data_config)
+    self._run_task(config)
+  def test_seqeval_metrics(self):
+    """Checks the keys of the reduced logs after two aggregation calls."""
+    config = tagging.TaggingConfig(
+        model=tagging.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        class_names=["O", "B-PER", "I-PER"])
+    task = tagging.TaggingTask(config)
+    model = task.build_model()
+    dataset = task.build_inputs(config.train_data)
+    iterator = iter(dataset)
+    strategy = tf.distribute.get_strategy()
+    distributed_outputs = strategy.run(
+        functools.partial(task.validation_step, model=model),
+        args=(next(iterator),))
+    outputs = tf.nest.map_structure(strategy.experimental_local_results,
+                                    distributed_outputs)
+    # Aggregate the same step twice: once without and once with prior state.
+    aggregated = task.aggregate_logs(step_outputs=outputs)
+    aggregated = task.aggregate_logs(state=aggregated, step_outputs=outputs)
+    self.assertCountEqual({"f1", "precision", "recall", "accuracy"},
+                          task.reduce_aggregated_logs(aggregated).keys())
+  def test_predict(self):
+    """Checks that `predict` returns one entry per written example."""
+    task_config = tagging.TaggingConfig(
+        model=tagging.ModelConfig(encoder=self._encoder_config),
+        train_data=self._train_data_config,
+        class_names=["O", "B-PER", "I-PER"])
+    task = tagging.TaggingTask(task_config)
+    model = task.build_model()
+    test_data_path = os.path.join(self.get_temp_dir(), "test.tf_record")
+    seq_length = 16
+    num_examples = 100
+    _create_fake_dataset(
+        test_data_path,
+        seq_length=seq_length,
+        num_labels=len(task_config.class_names),
+        num_examples=num_examples)
+    test_data_config = tagging_data_loader.TaggingDataConfig(
+        input_path=test_data_path,
+        seq_length=seq_length,
+        is_training=False,
+        global_batch_size=16,
+        drop_remainder=False,
+        include_sentence_id=True)
+    predict_ids, sentence_ids = tagging.predict(task, test_data_config, model)
+    self.assertLen(predict_ids, num_examples)
+    self.assertLen(sentence_ids, num_examples)
+if __name__ == "__main__":
+  tf.test.main()
--- a/research/compression/entropy_coder/lib/blocks_binarizer.py
+++ b/research/compression/entropy_coder/lib/blocks_binarizer.py
-# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,24 +13,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Common utils for tasks."""
-"""Activation and weight binarizer implementations."""
-import math
-import numpy as np
 import tensorflow as tf
+import tensorflow_hub as hub
-def ConvertSignCodeToZeroOneCode(x):
-  """Conversion from codes {-1, +1} to codes {0, 1}."""
+def get_encoder_from_hub(hub_module: str) -> tf.keras.Model:
-  return 0.5 * (x + 1.0)
+  """Gets an encoder from hub."""
+  input_word_ids = tf.keras.layers.Input(
+      shape=(None,), dtype=tf.int32, name='input_word_ids')
-def ConvertZeroOneCodeToSignCode(x):
+  input_mask = tf.keras.layers.Input(
-  """Convert from codes {0, 1} to codes {-1, +1}."""
+      shape=(None,), dtype=tf.int32, name='input_mask')
-  return 2.0 * x - 1.0
+  input_type_ids = tf.keras.layers.Input(
+      shape=(None,), dtype=tf.int32, name='input_type_ids')
+  hub_layer = hub.KerasLayer(hub_module, trainable=True)
-def CheckZeroOneCode(x):
+  pooled_output, sequence_output = hub_layer(
-  return tf.reduce_all(tf.equal(x * (x - 1.0), 0))
+      [input_word_ids, input_mask, input_type_ids])
+  return tf.keras.Model(
+      inputs=[input_word_ids, input_mask, input_type_ids],
+      outputs=[sequence_output, pooled_output])
--- a/official/nlp/transformer/beam_search.py
+++ b/official/nlp/transformer/beam_search.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Beam search in TF v2."""
-import tensorflow as tf
-from official.nlp.transformer import beam_search_v1 as v1
-_StateKeys = v1._StateKeys  # pylint: disable=protected-access
-class SequenceBeamSearchV2(v1.SequenceBeamSearch):
-  """Implementation of beam search loop in v2."""
-  def search(self, initial_ids, initial_cache):
-    """Beam search for sequences with highest scores."""
-    state, state_shapes = self._create_initial_state(initial_ids, initial_cache)
-    finished_state = tf.nest.map_structure(
-        tf.stop_gradient,
-        tf.while_loop(self._continue_search,
-                      self._search_step,
-                      loop_vars=[state],
-                      shape_invariants=[state_shapes],
-                      parallel_iterations=1))
-    finished_state = finished_state[0]
-    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
-    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
-    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
-    finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
-    finished_flags = finished_state[_StateKeys.FINISHED_FLAGS]
-    # 2.0 changes tf.where behavior. Should make parameters broadcastable.
-    finished_cond = tf.reduce_any(finished_flags, 1, name="finished_cond")
-    seq_cond = _expand_to_same_rank(finished_cond, finished_seq)
-    score_cond = _expand_to_same_rank(finished_cond, finished_scores)
-    # Account for corner case where there are no finished sequences for a
-    # particular batch item. In that case, return alive sequences for that batch
-    # item.
-    finished_seq = tf.where(seq_cond, finished_seq, alive_seq)
-    finished_scores = tf.where(
-        score_cond, finished_scores, alive_log_probs)
-    return finished_seq, finished_scores
-def sequence_beam_search(symbols_to_logits_fn,
-                         initial_ids,
-                         initial_cache,
-                         vocab_size,
-                         beam_size,
-                         alpha,
-                         max_decode_length,
-                         eos_id,
-                         padded_decode=False,
-                         dtype="float32"):
-  """Search for sequence of subtoken ids with the largest probability.
-  Args:
-    symbols_to_logits_fn: A function that takes in ids, index, and cache as
-      arguments. The passed in arguments will have shape:
-        ids -> A tensor with shape [batch_size * beam_size, index].
-        index -> A scalar.
-        cache -> A nested dictionary of tensors [batch_size * beam_size, ...].
-      The function must return a tuple of logits and new cache:
-        logits -> A tensor with shape [batch * beam_size, vocab_size].
-        new cache -> A nested dictionary with the same shape/structure as the
-          inputted cache.
-    initial_ids: An int32 tensor with shape [batch_size]. Starting ids for
-      each batch item.
-    initial_cache: A dictionary, containing starting decoder variables
-      information.
-    vocab_size: An integer, the size of tokens.
-    beam_size: An integer, the number of beams.
-    alpha: A float, defining the strength of length normalization.
-    max_decode_length: An integer, the maximum length to decoded a sequence.
-    eos_id: An integer, ID of eos token, used to determine when a sequence has
-      finished.
-    padded_decode: A bool, indicating if max_sequence_length padding is used
-      for beam search.
-    dtype: A tensorflow data type used for score computation. The default is
-      tf.float32.
-  Returns:
-    Top decoded sequences [batch_size, beam_size, max_decode_length]
-    sequence scores [batch_size, beam_size]
-  """
-  batch_size = (
-      initial_ids.shape.as_list()[0] if padded_decode else
-      tf.shape(initial_ids)[0])
-  sbs = SequenceBeamSearchV2(symbols_to_logits_fn, vocab_size, batch_size,
-                             beam_size, alpha, max_decode_length, eos_id,
-                             padded_decode, dtype)
-  return sbs.search(initial_ids, initial_cache)
-def _expand_to_same_rank(tensor, target):
-  """Expands a given tensor to target's rank to be broadcastable.
-  Args:
-    tensor: input tensor to tile. Shape: [b, d1, ..., da]
-    target: target tensor. Shape: [b, d1, ..., da, ..., dn]
-  Returns:
-    Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target.
-  Raises:
-    ValueError, if the shape rank of rank tensor/target is None.
-  """
-  if tensor.shape.rank is None:
-    raise ValueError("Expect rank for tensor shape, but got None.")
-  if target.shape.rank is None:
-    raise ValueError("Expect rank for target shape, but got None.")
-  with tf.name_scope("expand_rank"):
-    diff_rank = target.shape.rank - tensor.shape.rank
-    for _ in range(diff_rank):
-      tensor = tf.expand_dims(tensor, -1)
-    return tensor
--- a/official/nlp/transformer/beam_search_v1.py
+++ b/official/nlp/transformer/beam_search_v1.py
@@ -13,126 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 """Beam search to find the translated sequence with the highest probability.
-Source implementation from Tensor2Tensor:
-https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/beam_search.py
 """
-import numpy as np
 import tensorflow.compat.v1 as tf
-from tensorflow.python.util import nest
+from official.nlp.modeling.ops import beam_search
-def inf(dtype):
-  """Returns a value close to infinity, but is still finite in `dtype`.
-  This is useful to get a very large value that is still zero when multiplied by
-  zero. The floating-point "Inf" value is NaN when multiplied by zero.
-  Args:
-    dtype: A dtype. The returned value will be finite when casted to this dtype.
-  Returns:
-    A very large value.
-  """
-  if dtype == "float32" or dtype == "bfloat16":
-    return 1e7
-  elif dtype == "float16":
-    # Disable no-member lint error, as the linter thinks np.float16 does not
-    # exist for some reason.
-    return np.finfo(np.float16).max  # pylint: disable=no-member
-  else:
-    raise AssertionError('Invalid dtype: %s' % dtype)
-class _StateKeys(object):
-  """Keys to dictionary storing the state of the beam search loop."""
-  # Variable storing the loop index.
-  CUR_INDEX = "CUR_INDEX"
-  # Top sequences that are alive for each batch item. Alive sequences are ones
+_StateKeys = beam_search._StateKeys  # pylint: disable=protected-access
-  # that have not generated an EOS token. Sequences that reach EOS are marked as
-  # finished and moved to the FINISHED_SEQ tensor.
-  # Has shape [batch_size, beam_size, CUR_INDEX + 1]
-  ALIVE_SEQ = "ALIVE_SEQ"
-  # Log probabilities of each alive sequence. Shape [batch_size, beam_size]
-  ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS"
-  # Dictionary of cached values for each alive sequence. The cache stores
-  # the encoder output, attention bias, and the decoder attention output from
-  # the previous iteration.
-  ALIVE_CACHE = "ALIVE_CACHE"
-  # Top finished sequences for each batch item.
-  # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are
-  # shorter than CUR_INDEX + 1 are padded with 0s.
-  FINISHED_SEQ = "FINISHED_SEQ"
-  # Scores for each finished sequence. Score = log probability / length norm
-  # Shape [batch_size, beam_size]
-  FINISHED_SCORES = "FINISHED_SCORES"
-  # Flags indicating which sequences in the finished sequences are finished.
-  # At the beginning, all of the sequences in FINISHED_SEQ are filler values.
-  # True -> finished sequence, False -> filler. Shape [batch_size, beam_size]
-  FINISHED_FLAGS = "FINISHED_FLAGS"
+class SequenceBeamSearch(beam_search.SequenceBeamSearch):
-class SequenceBeamSearch(object):
  """Implementation of beam search loop."""
-  def __init__(self,
+  def _process_finished_state(self, finished_state):
-               symbols_to_logits_fn,
-               vocab_size,
-               batch_size,
-               beam_size,
-               alpha,
-               max_decode_length,
-               eos_id,
-               padded_decode,
-               dtype=tf.float32):
-    """Initialize sequence beam search.
-    Args:
-      symbols_to_logits_fn: A function to provide logits, which is the
-        interface to the Transformer model. The passed in arguments are:
-          ids -> A tensor with shape [batch_size * beam_size, index].
-          index -> A scalar.
-          cache -> A nested dictionary of tensors [batch_size * beam_size, ...].
-        The function must return a tuple of logits and the updated cache:
-          logits -> A tensor with shape [batch * beam_size, vocab_size].
-          updated cache -> A nested dictionary with the same structure as the
-            input cache.
-      vocab_size: An integer, the size of the vocabulary, used for topk
-        computation.
-      batch_size: An integer, the decode batch size.
-      beam_size: An integer, number of beams for beam search.
-      alpha: A float, defining the strength of length normalization.
-      max_decode_length: An integer, the maximum number of steps to decode
-        a sequence.
-      eos_id: An integer. ID of end of sentence token.
-      padded_decode: A bool, indicating if max_sequence_length padding is used
-        for beam search.
-      dtype: A tensorflow data type used for score computation. The default is
-        tf.float32.
-    """
-    self.symbols_to_logits_fn = symbols_to_logits_fn
-    self.vocab_size = vocab_size
-    self.batch_size = batch_size
-    self.beam_size = beam_size
-    self.alpha = alpha
-    self.max_decode_length = max_decode_length
-    self.eos_id = eos_id
-    self.padded_decode = padded_decode
-    self.dtype = tf.as_dtype(dtype)
-  def search(self, initial_ids, initial_cache):
-    """Beam search for sequences with highest scores."""
-    state, state_shapes = self._create_initial_state(initial_ids, initial_cache)
-    finished_state = tf.while_loop(
-        self._continue_search, self._search_step, loop_vars=[state],
-        shape_invariants=[state_shapes], parallel_iterations=1, back_prop=False)
-    finished_state = finished_state[0]
    alive_seq = finished_state[_StateKeys.ALIVE_SEQ]
    alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS]
    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
@@ -148,360 +40,6 @@ class SequenceBeamSearch(object):
        tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
    return finished_seq, finished_scores
-  def _create_initial_state(self, initial_ids, initial_cache):
-    """Return initial state dictionary and its shape invariants.
-    Args:
-      initial_ids: initial ids to pass into the symbols_to_logits_fn.
-        int tensor with shape [batch_size, 1]
-      initial_cache: dictionary storing values to be passed into the
-        symbols_to_logits_fn.
-    Returns:
-        state and shape invariant dictionaries with keys from _StateKeys
-    """
-    for key, value in initial_cache.items():
-      for inner_value in nest.flatten(value):
-        if inner_value.dtype != self.dtype:
-          raise TypeError(
-              "initial_cache element for key '%s' has dtype %s that does not "
-              "match SequenceBeamSearch's dtype of %s. Value: %s" %
-              (key, value.dtype.name, self.dtype.name, inner_value))
-    # Current loop index (starts at 0)
-    cur_index = tf.constant(0)
-    # Create alive sequence with shape [batch_size, beam_size, 1]
-    alive_seq = _expand_to_beam_size(initial_ids, self.beam_size)
-    alive_seq = tf.expand_dims(alive_seq, axis=2)
-    if self.padded_decode:
-      alive_seq = tf.tile(alive_seq, [1, 1, self.max_decode_length + 1])
-    # Create tensor for storing initial log probabilities.
-    # Assume initial_ids are prob 1.0
-    initial_log_probs = tf.constant(
-        [[0.] + [-float("inf")] * (self.beam_size - 1)], dtype=self.dtype)
-    alive_log_probs = tf.tile(initial_log_probs, [self.batch_size, 1])
-    # Expand all values stored in the dictionary to the beam size, so that each
-    # beam has a separate cache.
-    alive_cache = nest.map_structure(
-        lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache)
-    # Initialize tensor storing finished sequences with filler values.
-    finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
-    # Set scores of the initial finished seqs to negative infinity.
-    finished_scores = tf.ones([self.batch_size, self.beam_size],
-                              dtype=self.dtype) * -inf(self.dtype)
-    # Initialize finished flags with all False values.
-    finished_flags = tf.zeros([self.batch_size, self.beam_size], tf.bool)
-    # Create state dictionary
-    state = {
-        _StateKeys.CUR_INDEX: cur_index,
-        _StateKeys.ALIVE_SEQ: alive_seq,
-        _StateKeys.ALIVE_LOG_PROBS: alive_log_probs,
-        _StateKeys.ALIVE_CACHE: alive_cache,
-        _StateKeys.FINISHED_SEQ: finished_seq,
-        _StateKeys.FINISHED_SCORES: finished_scores,
-        _StateKeys.FINISHED_FLAGS: finished_flags
-    }
-    # Create state invariants for each value in the state dictionary. Each
-    # dimension must be a constant or None. A None dimension means either:
-    #   1) the dimension's value is a tensor that remains the same but may
-    #      depend on the input sequence to the model (e.g. batch size).
-    #   2) the dimension may have different values on different iterations.
-    if self.padded_decode:
-      state_shape_invariants = {
-          _StateKeys.CUR_INDEX:
-              tf.TensorShape([]),
-          _StateKeys.ALIVE_SEQ:
-              tf.TensorShape(
-                  [self.batch_size, self.beam_size,
-                   self.max_decode_length + 1]),
-          _StateKeys.ALIVE_LOG_PROBS:
-              tf.TensorShape([self.batch_size, self.beam_size]),
-          _StateKeys.ALIVE_CACHE:
-              nest.map_structure(_get_shape, alive_cache),
-          _StateKeys.FINISHED_SEQ:
-              tf.TensorShape(
-                  [self.batch_size, self.beam_size,
-                   self.max_decode_length + 1]),
-          _StateKeys.FINISHED_SCORES:
-              tf.TensorShape([self.batch_size, self.beam_size]),
-          _StateKeys.FINISHED_FLAGS:
-              tf.TensorShape([self.batch_size, self.beam_size])
-      }
-    else:
-      state_shape_invariants = {
-          _StateKeys.CUR_INDEX:
-              tf.TensorShape([]),
-          _StateKeys.ALIVE_SEQ:
-              tf.TensorShape([None, self.beam_size, None]),
-          _StateKeys.ALIVE_LOG_PROBS:
-              tf.TensorShape([None, self.beam_size]),
-          _StateKeys.ALIVE_CACHE:
-              nest.map_structure(_get_shape_keep_last_dim, alive_cache),
-          _StateKeys.FINISHED_SEQ:
-              tf.TensorShape([None, self.beam_size, None]),
-          _StateKeys.FINISHED_SCORES:
-              tf.TensorShape([None, self.beam_size]),
-          _StateKeys.FINISHED_FLAGS:
-              tf.TensorShape([None, self.beam_size])
-      }
-    return state, state_shape_invariants
-  def _continue_search(self, state):
-    """Return whether to continue the search loop.
-    The loop should terminate when
-      1) the maximum decode length has been reached, or
-      2) when the worst score in the finished sequences is better than the best
-         score in the alive sequences (i.e. the finished sequences are provably
-         unchanging)
-    Args:
-      state: A dictionary with the current loop state.
-    Returns:
-      Bool tensor with value True if loop should continue, False if loop should
-      terminate.
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
-    finished_scores = state[_StateKeys.FINISHED_SCORES]
-    finished_flags = state[_StateKeys.FINISHED_FLAGS]
-    not_at_max_decode_length = tf.less(i, self.max_decode_length)
-    # Calculate largest length penalty (the larger penalty, the better score).
-    max_length_norm = _length_normalization(self.alpha, self.max_decode_length,
-                                            dtype=self.dtype)
-    # Get the best possible scores from alive sequences.
-    best_alive_scores = alive_log_probs[:, 0] / max_length_norm
-    # Compute worst score in finished sequences for each batch element
-    finished_scores *= tf.cast(finished_flags,
-                               self.dtype)  # set filler scores to zero
-    lowest_finished_scores = tf.reduce_min(finished_scores, axis=1)
-    # If there are no finished sequences in a batch element, then set the lowest
-    # finished score to -INF for that element.
-    finished_batches = tf.reduce_any(finished_flags, 1)
-    lowest_finished_scores += ((1.0 -
-                                tf.cast(finished_batches, self.dtype)) *
-                               -inf(self.dtype))
-    worst_finished_score_better_than_best_alive_score = tf.reduce_all(
-        tf.greater(lowest_finished_scores, best_alive_scores)
-    )
-    return tf.logical_and(
-        not_at_max_decode_length,
-        tf.logical_not(worst_finished_score_better_than_best_alive_score)
-    )
-  def _search_step(self, state):
-    """Beam search loop body.
-    Grow alive sequences by a single ID. Sequences that have reached the EOS
-    token are marked as finished. The alive and finished sequences with the
-    highest log probabilities and scores are returned.
-    A sequence's finished score is calculated by dividing the log probability
-    by the length normalization factor. Without length normalization, the
-    search is more likely to return shorter sequences.
-    Args:
-      state: A dictionary with the current loop state.
-    Returns:
-      new state dictionary.
-    """
-    # Grow alive sequences by one token.
-    new_seq, new_log_probs, topk_ids, new_cache = self._grow_alive_seq(state)
-    new_finished_flags = tf.equal(topk_ids, self.eos_id)
-    # Collect top beam_size alive sequences
-    alive_state = self._get_new_alive_state(new_seq, new_log_probs,
-                                            new_finished_flags, new_cache)
-    # Combine newly finished sequences with existing finished sequences, and
-    # collect the top k scoring sequences.
-    finished_state = self._get_new_finished_state(state, new_seq, new_log_probs,
-                                                  new_finished_flags)
-    # Increment loop index and create new state dictionary
-    new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1}
-    new_state.update(alive_state)
-    new_state.update(finished_state)
-    return [new_state]
-  def _grow_alive_seq(self, state):
-    """Grow alive sequences by one token, and collect top 2*beam_size sequences.
-    2*beam_size sequences are collected because some sequences may have reached
-    the EOS token. 2*beam_size ensures that at least beam_size sequences are
-    still alive.
-    Args:
-      state: A dictionary with the current loop state.
-    Returns:
-      Tuple of
-      (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
-       Scores of returned sequences [batch_size, 2 * beam_size],
-       New alive cache, for each of the 2 * beam_size sequences)
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    alive_seq = state[_StateKeys.ALIVE_SEQ]
-    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
-    alive_cache = state[_StateKeys.ALIVE_CACHE]
-    beams_to_keep = 2 * self.beam_size
-    # Get logits for the next candidate IDs for the alive sequences. Get the new
-    # cache values at the same time.
-    if self.padded_decode:
-      flat_ids = tf.reshape(
-          tf.slice(alive_seq, [0, 0, i], [self.batch_size, self.beam_size, 1]),
-          [self.batch_size * self.beam_size, -1])
-    else:
-      flat_ids = _flatten_beam_dim(alive_seq)  # [batch_size * beam_size]
-    flat_cache = nest.map_structure(_flatten_beam_dim, alive_cache)
-    flat_logits, flat_cache = self.symbols_to_logits_fn(flat_ids, i, flat_cache)
-    # Unflatten logits to shape [batch_size, beam_size, vocab_size]
-    logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size)
-    new_cache = nest.map_structure(
-        lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size),
-        flat_cache)
-    # Convert logits to normalized log probs
-    candidate_log_probs = _log_prob_from_logits(logits)
-    # Calculate new log probabilities if each of the alive sequences were
-    # extended by the candidate IDs.
-    # Shape [batch_size, beam_size, vocab_size]
-    log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
-    # Each batch item has beam_size * vocab_size candidate sequences. For each
-    # batch item, get the k candidates with the highest log probabilities.
-    flat_log_probs = tf.reshape(log_probs,
-                                [-1, self.beam_size * self.vocab_size])
-    topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep)
-    # Extract the alive sequences that generate the highest log probabilities
-    # after being extended.
-    topk_beam_indices = topk_indices // self.vocab_size
-    topk_seq, new_cache = _gather_beams(
-        [alive_seq, new_cache], topk_beam_indices, self.batch_size,
-        beams_to_keep)
-    # Append the most probable IDs to the topk sequences
-    topk_ids = topk_indices % self.vocab_size
-    if self.padded_decode:
-      topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1])
-      # TODO(b/145533236, hongkuny): Reverts once TF fix the validation.
-      topk_seq = tf.tensor_scatter_nd_update(topk_seq, [[i + 1]],
-                                             tf.expand_dims(topk_ids, axis=0))
-      topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0])
-    else:
-      topk_seq = tf.concat([topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
-    return topk_seq, topk_log_probs, topk_ids, new_cache
-  def _get_new_alive_state(self, new_seq, new_log_probs, new_finished_flags,
-                           new_cache):
-    """Gather the top k sequences that are still alive.
-    Args:
-      new_seq: New sequences generated by growing the current alive sequences
-        int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1]
-      new_log_probs: Log probabilities of new sequences float32 tensor with
-        shape [batch_size, 2 * beam_size]
-      new_finished_flags: A boolean Tensor indicates which sequences are live
-        inside the beam.
-      new_cache: Dict of cached values for each sequence.
-    Returns:
-      Dictionary with alive keys from _StateKeys:
-        {Top beam_size sequences that are still alive (don't end with eos_id)
-         Log probabilities of top alive sequences
-         Dict cache storing decoder states for top alive sequences}
-    """
-    # To prevent finished sequences from being considered, set log probs to -inf
-    new_log_probs += tf.cast(new_finished_flags, self.dtype) * -inf(self.dtype)
-    top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams(
-        [new_seq, new_log_probs, new_cache], new_log_probs, self.batch_size,
-        self.beam_size)
-    return {
-        _StateKeys.ALIVE_SEQ: top_alive_seq,
-        _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs,
-        _StateKeys.ALIVE_CACHE: top_alive_cache
-    }
-  def _get_new_finished_state(self, state, new_seq, new_log_probs,
-                              new_finished_flags):
-    """Combine new and old finished sequences, and gather the top k sequences.
-    Args:
-      state: A dictionary with the current loop state.
-      new_seq: New sequences generated by growing the current alive sequences
-        int32 tensor with shape [batch_size, beam_size, i + 1]
-      new_log_probs: Log probabilities of new sequences float32 tensor with
-        shape [batch_size, beam_size]
-      new_finished_flags: A boolean Tensor indicates which sequences are live
-        inside the beam.
-    Returns:
-      Dictionary with finished keys from _StateKeys:
-        {Top beam_size finished sequences based on score,
-         Scores of finished sequences,
-         Finished flags of finished sequences}
-    """
-    i = state[_StateKeys.CUR_INDEX]
-    finished_seq = state[_StateKeys.FINISHED_SEQ]
-    finished_scores = state[_StateKeys.FINISHED_SCORES]
-    finished_flags = state[_StateKeys.FINISHED_FLAGS]
-    # First append a column of 0-ids to finished_seq to increment the length.
-    # New shape of finished_seq: [batch_size, beam_size, i + 1]
-    if not self.padded_decode:
-      finished_seq = tf.concat([
-          finished_seq,
-          tf.zeros([self.batch_size, self.beam_size, 1], tf.int32)
-      ],
-                               axis=2)
-    # Calculate new seq scores from log probabilities.
-    length_norm = _length_normalization(self.alpha, i + 1, dtype=self.dtype)
-    new_scores = new_log_probs / length_norm
-    # Set the scores of the still-alive seq in new_seq to large negative values.
-    new_scores += ((1. - tf.cast(new_finished_flags, self.dtype)) *
-                   -inf(self.dtype))
-    # Combine sequences, scores, and flags.
-    finished_seq = tf.concat([finished_seq, new_seq], axis=1)
-    finished_scores = tf.concat([finished_scores, new_scores], axis=1)
-    finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1)
-    # Return the finished sequences with the best scores.
-    top_finished_seq, top_finished_scores, top_finished_flags = (
-        _gather_topk_beams([finished_seq, finished_scores, finished_flags],
-                           finished_scores, self.batch_size, self.beam_size))
-    return {
-        _StateKeys.FINISHED_SEQ: top_finished_seq,
-        _StateKeys.FINISHED_SCORES: top_finished_scores,
-        _StateKeys.FINISHED_FLAGS: top_finished_flags
-    }
 def sequence_beam_search(
    symbols_to_logits_fn, initial_ids, initial_cache, vocab_size, beam_size,
@@ -536,140 +74,6 @@ def sequence_beam_search(
    Top decoded sequences [batch_size, beam_size, max_decode_length]
    sequence scores [batch_size, beam_size]
  """
-  batch_size = (
+  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, beam_size, alpha,
-      initial_ids.shape.as_list()[0] if padded_decode else
+                           max_decode_length, eos_id, padded_decode)
-      tf.shape(initial_ids)[0])
-  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, batch_size,
-                           beam_size, alpha, max_decode_length, eos_id,
-                           padded_decode)
  return sbs.search(initial_ids, initial_cache)
-def _log_prob_from_logits(logits):
-  return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True)
-def _length_normalization(alpha, length, dtype=tf.float32):
-  """Return length normalization factor."""
-  return tf.pow(((5. + tf.cast(length, dtype)) / 6.), alpha)
-def _expand_to_beam_size(tensor, beam_size):
-  """Tiles a given tensor by beam_size.
-  Args:
-    tensor: tensor to tile [batch_size, ...]
-    beam_size: How much to tile the tensor by.
-  Returns:
-    Tiled tensor [batch_size, beam_size, ...]
-  """
-  tensor = tf.expand_dims(tensor, axis=1)
-  tile_dims = [1] * tensor.shape.ndims
-  tile_dims[1] = beam_size
-  return tf.tile(tensor, tile_dims)
-def _shape_list(tensor):
-  """Return a list of the tensor's shape, and ensure no None values in list."""
-  # Get statically known shape (may contain None's for unknown dimensions)
-  shape = tensor.get_shape().as_list()
-  # Ensure that the shape values are not None
-  dynamic_shape = tf.shape(tensor)
-  for i in range(len(shape)):  # pylint: disable=consider-using-enumerate
-    if shape[i] is None:
-      shape[i] = dynamic_shape[i]
-  return shape
-def _get_shape_keep_last_dim(tensor):
-  shape_list = _shape_list(tensor)
-  # Only the last dimension may stay static; every leading dimension is set
-  # to None.
-  for i in range(len(shape_list) - 1):
-    shape_list[i] = None
-  if isinstance(shape_list[-1], tf.Tensor):
-    shape_list[-1] = None
-  return tf.TensorShape(shape_list)
-def _get_shape(tensor):
-  """Return the shape of the input tensor."""
-  return tf.TensorShape(_shape_list(tensor))
-def _flatten_beam_dim(tensor):
-  """Reshapes first two dimensions into a single dimension.
-  Args:
-    tensor: Tensor to reshape of shape [A, B, ...]
-  Returns:
-    Reshaped tensor of shape [A*B, ...]
-  """
-  shape = _shape_list(tensor)
-  shape[0] *= shape[1]
-  shape.pop(1)  # Remove beam dim
-  return tf.reshape(tensor, shape)
-def _unflatten_beam_dim(tensor, batch_size, beam_size):
-  """Reshapes first dimension back to [batch_size, beam_size].
-  Args:
-    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
-    batch_size: Tensor, original batch size.
-    beam_size: int, original beam size.
-  Returns:
-    Reshaped tensor of shape [batch_size, beam_size, ...]
-  """
-  shape = _shape_list(tensor)
-  new_shape = [batch_size, beam_size] + shape[1:]
-  return tf.reshape(tensor, new_shape)
-def _gather_beams(nested, beam_indices, batch_size, new_beam_size):
-  """Gather beams from nested structure of tensors.
-  Each tensor in nested represents a batch of beams, where beam refers to a
-  single search state (beam search involves searching through multiple states
-  in parallel).
-  This function is used to gather the top beams, specified by
-  beam_indices, from the nested tensors.
-  Args:
-    nested: Nested structure (tensor, list, tuple or dict) containing tensors
-      with shape [batch_size, beam_size, ...].
-    beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
-     value in beam_indices must be between [0, beam_size), and are not
-     necessarily unique.
-    batch_size: int size of batch
-    new_beam_size: int number of beams to be pulled from the nested tensors.
-  Returns:
-    Nested structure containing tensors with shape
-      [batch_size, new_beam_size, ...]
-  """
-  # Computes the i'th coordinate that contains the batch index for gather_nd.
-  # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..].
-  batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size
-  batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size])
-  # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor
-  # with shape [batch_size, beam_size, 2], where the last dimension contains
-  # the (i, j) gathering coordinates.
-  coordinates = tf.stack([batch_pos, beam_indices], axis=2)
-  return nest.map_structure(
-      lambda state: tf.gather_nd(state, coordinates), nested)
-def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size):
-  """Gather top beams from nested structure."""
-  _, topk_indexes = tf.nn.top_k(score_or_log_prob, k=beam_size)
-  return _gather_beams(nested, topk_indexes, batch_size, beam_size)
--- a/official/nlp/transformer/compute_bleu.py
+++ b/official/nlp/transformer/compute_bleu.py
@@ -26,7 +26,7 @@ import re
 import sys
 import unicodedata
-from absl import app as absl_app
+from absl import app
 from absl import flags
 import six
 from six.moves import range
@@ -92,7 +92,11 @@ def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
      tf.io.gfile.GFile(ref_filename).read()).strip().splitlines()
  hyp_lines = tokenizer.native_to_unicode(
      tf.io.gfile.GFile(hyp_filename).read()).strip().splitlines()
+  return bleu_on_list(ref_lines, hyp_lines, case_sensitive)
+def bleu_on_list(ref_lines, hyp_lines, case_sensitive=False):
+  """Compute BLEU for two lists of strings (reference and hypothesis)."""
  if len(ref_lines) != len(hyp_lines):
    raise ValueError(
        "Reference and translation files have different number of "
@@ -145,4 +149,4 @@ if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  define_compute_bleu_flags()
  FLAGS = flags.FLAGS
-  absl_app.run(main)
+  app.run(main)