Commit 47bc1813 authored by syiming

Merge remote-tracking branch 'upstream/master' into add_multilevel_crop_and_resize

parents d8611151 b035a227
......@@ -50,16 +50,19 @@ class BertPretrainerTest(keras_parameterized.TestCase):
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
masked_lm_positions = tf.keras.Input(
shape=(num_token_predictions,), dtype=tf.int32)
# Invoke the trainer model on the inputs. This causes the layer to be built.
lm_outs, cls_outs = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
outputs = bert_trainer_model(
[word_ids, mask, type_ids, masked_lm_positions])
# Validate that the outputs are of the expected shape.
expected_lm_shape = [None, num_token_predictions, vocab_size]
expected_classification_shape = [None, num_classes]
self.assertAllEqual(expected_lm_shape, lm_outs.shape.as_list())
self.assertAllEqual(expected_classification_shape, cls_outs.shape.as_list())
self.assertAllEqual(expected_lm_shape, outputs['masked_lm'].shape.as_list())
self.assertAllEqual(expected_classification_shape,
outputs['classification'].shape.as_list())
def test_bert_trainer_tensor_call(self):
"""Validate that the Keras object can be invoked."""
......@@ -81,7 +84,7 @@ class BertPretrainerTest(keras_parameterized.TestCase):
# Invoke the trainer model on the tensors. In Eager mode, this does the
# actual calculation. (We can't validate the outputs, since the network is
# too complex: this simply ensures we're not hitting runtime errors.)
_, _ = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
_ = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
def test_serialize_deserialize(self):
"""Validate that the BERT trainer can be serialized and deserialized."""
......@@ -123,7 +126,7 @@ class BertPretrainerTest(keras_parameterized.TestCase):
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_mask = tf.keras.Input(shape=(num_token_predictions,), dtype=tf.int32)
# Invoke the trainer model on the inputs. This causes the layer to be built.
outputs = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
......
......@@ -51,11 +51,13 @@ class BertSpanLabeler(tf.keras.Model):
output='logits',
**kwargs):
self._self_setattr_tracking = False
self._network = network
self._config = {
'network': network,
'initializer': initializer,
'output': output,
}
# We want to use the inputs of the passed network as the inputs to this
# Model. To do this, we need to keep a handle to the network inputs for use
# when we construct the Model object at the end of init.
......@@ -89,6 +91,10 @@ class BertSpanLabeler(tf.keras.Model):
super(BertSpanLabeler, self).__init__(
inputs=inputs, outputs=logits, **kwargs)
@property
def checkpoint_items(self):
return dict(encoder=self._network)
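# A minimal sketch (assumed usage, not part of this file) of how `checkpoint_items`
# lets a downstream task restore only the pretrained encoder weights; `encoder` and
# the checkpoint path are placeholders:
#   model = BertSpanLabeler(network=encoder)
#   ckpt = tf.train.Checkpoint(**model.checkpoint_items)  # {'encoder': encoder}
#   ckpt.restore(path_to_pretrained_checkpoint).expect_partial()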
def get_config(self):
return self._config
......
......@@ -55,6 +55,7 @@ class BertTokenClassifier(tf.keras.Model):
dropout_rate=0.1,
**kwargs):
self._self_setattr_tracking = False
self._network = network
self._config = {
'network': network,
'num_classes': num_classes,
......@@ -84,6 +85,10 @@ class BertTokenClassifier(tf.keras.Model):
super(BertTokenClassifier, self).__init__(
inputs=inputs, outputs=predictions, **kwargs)
@property
def checkpoint_items(self):
return dict(encoder=self._network)
def get_config(self):
return self._config
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Trainer network for ELECTRA models."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import copy
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling import layers
@tf.keras.utils.register_keras_serializable(package='Text')
class ElectraPretrainer(tf.keras.Model):
"""ELECTRA network training model.
This is an implementation of the network structure described in "ELECTRA:
Pre-training Text Encoders as Discriminators Rather Than Generators" (
https://arxiv.org/abs/2003.10555).
The ElectraPretrainer allows a user to pass in two transformer networks, one for
the generator and one for the discriminator, and instantiates the masked language
model (on the generator side) and classification networks (on the discriminator
side) that are used to create the training objectives.
Arguments:
generator_network: A transformer network for the generator. This network should
output a sequence output and an optional classification output.
discriminator_network: A transformer network for the discriminator. This network
should output a sequence output.
vocab_size: Size of the generator output vocabulary.
num_classes: Number of classes to predict from the classification network
for the generator network (currently unused).
sequence_length: Input sequence length.
last_hidden_dim: Last hidden dim of the generator transformer output.
num_token_predictions: Number of tokens to predict from the masked LM.
mlm_activation: The activation (if any) to use in the masked LM and
classification networks. If None, no activation will be used.
mlm_initializer: The initializer (if any) to use in the masked LM and
classification networks. Defaults to a Glorot uniform initializer.
output_type: The output style for this network. Can be either 'logits' or
'predictions'.
disallow_correct: Whether to disallow the generator from generating the exact
same token that appears in the original sentence.
"""
def __init__(self,
generator_network,
discriminator_network,
vocab_size,
num_classes,
sequence_length,
last_hidden_dim,
num_token_predictions,
mlm_activation=None,
mlm_initializer='glorot_uniform',
output_type='logits',
disallow_correct=False,
**kwargs):
super(ElectraPretrainer, self).__init__()
self._config = {
'generator_network': generator_network,
'discriminator_network': discriminator_network,
'vocab_size': vocab_size,
'num_classes': num_classes,
'sequence_length': sequence_length,
'last_hidden_dim': last_hidden_dim,
'num_token_predictions': num_token_predictions,
'mlm_activation': mlm_activation,
'mlm_initializer': mlm_initializer,
'output_type': output_type,
'disallow_correct': disallow_correct,
}
for k, v in kwargs.items():
self._config[k] = v
self.generator_network = generator_network
self.discriminator_network = discriminator_network
self.vocab_size = vocab_size
self.num_classes = num_classes
self.sequence_length = sequence_length
self.last_hidden_dim = last_hidden_dim
self.num_token_predictions = num_token_predictions
self.mlm_activation = mlm_activation
self.mlm_initializer = mlm_initializer
self.output_type = output_type
self.disallow_correct = disallow_correct
self.masked_lm = layers.MaskedLM(
embedding_table=generator_network.get_embedding_table(),
activation=mlm_activation,
initializer=mlm_initializer,
output=output_type,
name='generator_masked_lm')
self.classification = layers.ClassificationHead(
inner_dim=last_hidden_dim,
num_classes=num_classes,
initializer=mlm_initializer,
name='generator_classification_head')
self.discriminator_head = tf.keras.layers.Dense(
units=1, kernel_initializer=mlm_initializer)
def call(self, inputs):
"""ELECTRA forward pass.
Args:
inputs: A dict of all inputs, same as the standard BERT model.
Returns:
outputs: A dict of pretrainer model outputs, including
(1) lm_outputs: a [batch_size, num_token_predictions, vocab_size] tensor
indicating logits on masked positions.
(2) sentence_outputs: a [batch_size, num_classes] tensor indicating
logits for the next sentence prediction task.
(3) disc_logits: a [batch_size, sequence_length] tensor indicating
logits for discriminator replaced token detection task.
(4) disc_label: a [batch_size, sequence_length] tensor indicating
target labels for discriminator replaced token detection task.
"""
input_word_ids = inputs['input_word_ids']
input_mask = inputs['input_mask']
input_type_ids = inputs['input_type_ids']
masked_lm_positions = inputs['masked_lm_positions']
### Generator ###
sequence_output, cls_output = self.generator_network(
[input_word_ids, input_mask, input_type_ids])
# The generator encoder network may get outputs from all layers.
if isinstance(sequence_output, list):
sequence_output = sequence_output[-1]
if isinstance(cls_output, list):
cls_output = cls_output[-1]
lm_outputs = self.masked_lm(sequence_output, masked_lm_positions)
sentence_outputs = self.classification(sequence_output)
### Sampling from generator ###
fake_data = self._get_fake_data(inputs, lm_outputs, duplicate=True)
### Discriminator ###
disc_input = fake_data['inputs']
disc_label = fake_data['is_fake_tokens']
disc_sequence_output, _ = self.discriminator_network([
disc_input['input_word_ids'], disc_input['input_mask'],
disc_input['input_type_ids']
])
# The discriminator encoder network may get outputs from all layers.
if isinstance(disc_sequence_output, list):
disc_sequence_output = disc_sequence_output[-1]
disc_logits = self.discriminator_head(disc_sequence_output)
disc_logits = tf.squeeze(disc_logits, axis=-1)
outputs = {
'lm_outputs': lm_outputs,
'sentence_outputs': sentence_outputs,
'disc_logits': disc_logits,
'disc_label': disc_label,
}
return outputs
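# A rough sketch (not part of this file) of how these outputs are typically turned
# into the ELECTRA objective; `disc_weight` (e.g. 50 in the paper) and the loss
# choices below are assumptions, not this repository's training code:
#   outputs = pretrainer(inputs)
#   mlm_loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(
#       inputs['masked_lm_ids'], outputs['lm_outputs'], from_logits=True))
#   rtd_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
#       labels=tf.cast(outputs['disc_label'], tf.float32),
#       logits=outputs['disc_logits']))
#   total_loss = mlm_loss + disc_weight * rtd_loss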
def _get_fake_data(self, inputs, mlm_logits, duplicate=True):
"""Generate corrupted data for discriminator.
Args:
inputs: A dict of all inputs, same as the input of call() function
mlm_logits: The generator's output logits
duplicate: Whether to copy the original inputs dict during modifications
Returns:
A dict of generated fake data
"""
inputs = unmask(inputs, duplicate)
if self.disallow_correct:
disallow = tf.one_hot(
inputs['masked_lm_ids'], depth=self.vocab_size, dtype=tf.float32)
else:
disallow = None
sampled_tokens = tf.stop_gradient(
sample_from_softmax(mlm_logits, disallow=disallow))
sampled_tokids = tf.argmax(sampled_tokens, -1, output_type=tf.int32)
updated_input_ids, masked = scatter_update(inputs['input_word_ids'],
sampled_tokids,
inputs['masked_lm_positions'])
labels = masked * (1 - tf.cast(
tf.equal(updated_input_ids, inputs['input_word_ids']), tf.int32))
updated_inputs = get_updated_inputs(
inputs, duplicate, input_word_ids=updated_input_ids)
return {
'inputs': updated_inputs,
'is_fake_tokens': labels,
'sampled_tokens': sampled_tokens
}
def get_config(self):
return self._config
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
def scatter_update(sequence, updates, positions):
"""Scatter-update a sequence.
Args:
sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor.
updates: A [batch_size, n_positions] or [batch_size, n_positions, depth] tensor
of values to write into `sequence`.
positions: A [batch_size, n_positions] tensor of positions to update.
Returns:
updated_sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth]
tensor of "sequence" with elements at "positions" replaced by the values
at "updates". Updates to index 0 are ignored. If there are duplicated
positions the update is only applied once.
updates_mask: A [batch_size, seq_len] mask tensor indicating which inputs were
updated.
"""
shape = tf_utils.get_shape_list(sequence, expected_rank=[2, 3])
depth_dimension = (len(shape) == 3)
if depth_dimension:
batch_size, seq_len, depth = shape
else:
batch_size, seq_len = shape
depth = 1
sequence = tf.expand_dims(sequence, -1)
n_positions = tf_utils.get_shape_list(positions)[1]
shift = tf.expand_dims(seq_len * tf.range(batch_size), -1)
flat_positions = tf.reshape(positions + shift, [-1, 1])
flat_updates = tf.reshape(updates, [-1, depth])
updates = tf.scatter_nd(flat_positions, flat_updates,
[batch_size * seq_len, depth])
updates = tf.reshape(updates, [batch_size, seq_len, depth])
flat_updates_mask = tf.ones([batch_size * n_positions], tf.int32)
updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask,
[batch_size * seq_len])
updates_mask = tf.reshape(updates_mask, [batch_size, seq_len])
not_first_token = tf.concat([
tf.zeros((batch_size, 1), tf.int32),
tf.ones((batch_size, seq_len - 1), tf.int32)
], -1)
updates_mask *= not_first_token
updates_mask_3d = tf.expand_dims(updates_mask, -1)
# account for duplicate positions
if sequence.dtype == tf.float32:
updates_mask_3d = tf.cast(updates_mask_3d, tf.float32)
updates /= tf.maximum(1.0, updates_mask_3d)
else:
assert sequence.dtype == tf.int32
updates = tf.math.floordiv(updates, tf.maximum(1, updates_mask_3d))
updates_mask = tf.minimum(updates_mask, 1)
updates_mask_3d = tf.minimum(updates_mask_3d, 1)
updated_sequence = (((1 - updates_mask_3d) * sequence) +
(updates_mask_3d * updates))
if not depth_dimension:
updated_sequence = tf.squeeze(updated_sequence, -1)
return updated_sequence, updates_mask
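# A small worked example of `scatter_update` with toy values (batch_size=2,
# seq_len=4, n_positions=2; values are illustrative only):
#   sequence = tf.constant([[5, 6, 7, 8], [1, 2, 3, 4]], tf.int32)
#   updates = tf.constant([[9, 9], [7, 7]], tf.int32)
#   positions = tf.constant([[1, 2], [2, 3]], tf.int32)
#   updated, mask = scatter_update(sequence, updates, positions)
#   # updated == [[5, 9, 9, 8], [1, 2, 7, 7]]
#   # mask    == [[0, 1, 1, 0], [0, 0, 1, 1]]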
def sample_from_softmax(logits, disallow=None):
"""Implement softmax sampling using gumbel softmax trick.
Args:
logits: A [batch_size, num_token_predictions, vocab_size] tensor indicating
the generator output logits for each masked position.
disallow: If `None`, we directly sample tokens from the logits. Otherwise,
this is a tensor of size [batch_size, num_token_predictions, vocab_size]
indicating the true word id in each masked position.
Returns:
sampled_tokens: A [batch_size, num_token_predictions, vocab_size] one hot
tensor indicating the sampled word id in each masked position.
"""
if disallow is not None:
logits -= 1000.0 * disallow
uniform_noise = tf.random.uniform(
tf_utils.get_shape_list(logits), minval=0, maxval=1)
gumbel_noise = -tf.math.log(-tf.math.log(uniform_noise + 1e-9) + 1e-9)
# Here we essentially follow the original paper and use temperature 1.0 for
# generator output logits.
sampled_tokens = tf.one_hot(
tf.argmax(tf.nn.softmax(logits + gumbel_noise), -1, output_type=tf.int32),
logits.shape[-1])
return sampled_tokens
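# A minimal sketch of using `sample_from_softmax`; the toy shapes and values are
# assumptions for illustration:
#   logits = tf.constant([[[2.0, 1.0, 0.1]]])        # [batch=1, predictions=1, vocab=3]
#   disallow = tf.one_hot([[0]], depth=3)            # forbid re-sampling token id 0
#   sampled = sample_from_softmax(logits, disallow)  # one-hot over the vocab axis
#   sampled_ids = tf.argmax(sampled, -1, output_type=tf.int32)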
def unmask(inputs, duplicate):
"""Restores the original token ids at the masked positions of `input_word_ids`."""
unmasked_input_word_ids, _ = scatter_update(inputs['input_word_ids'],
inputs['masked_lm_ids'],
inputs['masked_lm_positions'])
return get_updated_inputs(
inputs, duplicate, input_word_ids=unmasked_input_word_ids)
def get_updated_inputs(inputs, duplicate, **kwargs):
"""Returns `inputs` updated with `kwargs`, copying the dict first if `duplicate`."""
if duplicate:
new_inputs = copy.copy(inputs)
else:
new_inputs = inputs
for k, v in kwargs.items():
new_inputs[k] = v
return new_inputs
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ELECTRA pre trainer network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling import networks
from official.nlp.modeling.models import electra_pretrainer
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class ElectraPretrainerTest(keras_parameterized.TestCase):
def test_electra_pretrainer(self):
"""Validate that the Keras object can be created."""
# Build a transformer network to use within the ELECTRA trainer.
vocab_size = 100
sequence_length = 512
test_generator_network = networks.TransformerEncoder(
vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)
test_discriminator_network = networks.TransformerEncoder(
vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)
# Create an ELECTRA trainer with the created networks.
num_classes = 3
num_token_predictions = 2
electra_trainer_model = electra_pretrainer.ElectraPretrainer(
generator_network=test_generator_network,
discriminator_network=test_discriminator_network,
vocab_size=vocab_size,
num_classes=num_classes,
sequence_length=sequence_length,
last_hidden_dim=768,
num_token_predictions=num_token_predictions,
disallow_correct=True)
# Create a set of 2-dimensional inputs (the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
lm_positions = tf.keras.Input(
shape=(num_token_predictions,), dtype=tf.int32)
lm_ids = tf.keras.Input(shape=(num_token_predictions,), dtype=tf.int32)
inputs = {
'input_word_ids': word_ids,
'input_mask': mask,
'input_type_ids': type_ids,
'masked_lm_positions': lm_positions,
'masked_lm_ids': lm_ids
}
# Invoke the trainer model on the inputs. This causes the layer to be built.
outputs = electra_trainer_model(inputs)
lm_outs = outputs['lm_outputs']
cls_outs = outputs['sentence_outputs']
disc_logits = outputs['disc_logits']
disc_label = outputs['disc_label']
# Validate that the outputs are of the expected shape.
expected_lm_shape = [None, num_token_predictions, vocab_size]
expected_classification_shape = [None, num_classes]
expected_disc_logits_shape = [None, sequence_length]
expected_disc_label_shape = [None, sequence_length]
self.assertAllEqual(expected_lm_shape, lm_outs.shape.as_list())
self.assertAllEqual(expected_classification_shape, cls_outs.shape.as_list())
self.assertAllEqual(expected_disc_logits_shape, disc_logits.shape.as_list())
self.assertAllEqual(expected_disc_label_shape, disc_label.shape.as_list())
def test_electra_trainer_tensor_call(self):
"""Validate that the Keras object can be invoked."""
# Build a transformer network to use within the ELECTRA trainer. (Here, we
# use a short sequence_length for convenience.)
test_generator_network = networks.TransformerEncoder(
vocab_size=100, num_layers=4, sequence_length=3)
test_discriminator_network = networks.TransformerEncoder(
vocab_size=100, num_layers=4, sequence_length=3)
# Create an ELECTRA trainer with the created networks.
electra_trainer_model = electra_pretrainer.ElectraPretrainer(
generator_network=test_generator_network,
discriminator_network=test_discriminator_network,
vocab_size=100,
num_classes=2,
sequence_length=3,
last_hidden_dim=768,
num_token_predictions=2)
# Create a set of 2-dimensional data tensors to feed into the model.
word_ids = tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.int32)
mask = tf.constant([[1, 1, 1], [1, 0, 0]], dtype=tf.int32)
type_ids = tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.int32)
lm_positions = tf.constant([[0, 1], [0, 2]], dtype=tf.int32)
lm_ids = tf.constant([[10, 20], [20, 30]], dtype=tf.int32)
inputs = {
'input_word_ids': word_ids,
'input_mask': mask,
'input_type_ids': type_ids,
'masked_lm_positions': lm_positions,
'masked_lm_ids': lm_ids
}
# Invoke the trainer model on the tensors. In Eager mode, this does the
# actual calculation. (We can't validate the outputs, since the network is
# too complex: this simply ensures we're not hitting runtime errors.)
_ = electra_trainer_model(inputs)
def test_serialize_deserialize(self):
"""Validate that the ELECTRA trainer can be serialized and deserialized."""
# Build transformer networks to use within the ELECTRA trainer. (Here, we use
# a short sequence_length for convenience.)
test_generator_network = networks.TransformerEncoder(
vocab_size=100, num_layers=4, sequence_length=3)
test_discriminator_network = networks.TransformerEncoder(
vocab_size=100, num_layers=4, sequence_length=3)
# Create an ELECTRA trainer with the created networks. (Note that all the args
# are different, so we can catch any serialization mismatches.)
electra_trainer_model = electra_pretrainer.ElectraPretrainer(
generator_network=test_generator_network,
discriminator_network=test_discriminator_network,
vocab_size=100,
num_classes=2,
sequence_length=3,
last_hidden_dim=768,
num_token_predictions=2)
# Create another ELECTRA trainer via serialization and deserialization.
config = electra_trainer_model.get_config()
new_electra_trainer_model = electra_pretrainer.ElectraPretrainer.from_config(
config)
# Validate that the config can be forced to JSON.
_ = new_electra_trainer_model.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(electra_trainer_model.get_config(),
new_electra_trainer_model.get_config())
if __name__ == '__main__':
tf.test.main()
......@@ -16,8 +16,6 @@ Self-supervised Learning of Language Representations]
(https://arxiv.org/abs/1909.11942). Compared with [BERT](https://arxiv.org/abs/1810.04805), ALBERT factorizes embedding parameters
into two smaller matrices and shares parameters across layers.
* [`MaskedLM`](masked_lm.py) implements a masked language model for BERT pretraining. It assumes that the network being passed has a `get_embedding_table()` method.
* [`Classification`](classification.py) contains a single hidden layer, and is
intended for use as a classification or regression (if number of classes is set
to 1) head.
......
......@@ -16,7 +16,6 @@
from official.nlp.modeling.networks.albert_transformer_encoder import AlbertTransformerEncoder
from official.nlp.modeling.networks.classification import Classification
from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
from official.nlp.modeling.networks.masked_lm import MaskedLM
from official.nlp.modeling.networks.span_labeling import SpanLabeling
from official.nlp.modeling.networks.token_classification import TokenClassification
from official.nlp.modeling.networks.transformer_encoder import TransformerEncoder
......@@ -60,7 +60,7 @@ class TransformerEncoder(tf.keras.Model):
initializer: The initializer to use for all weights in this encoder.
return_all_encoder_outputs: Whether to output sequence embedding outputs of
all encoder transformer layers.
output_range: the sequence output range, [0, output_range), by slicing the
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
......@@ -69,6 +69,10 @@ class TransformerEncoder(tf.keras.Model):
two matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
embedding_layer: The word embedding layer. `None` means we will create a new
embedding layer. Otherwise, we will reuse the given embedding layer. This
parameter was originally added for the ELECTRA model, which needs to tie the
generator embeddings to the discriminator embeddings.
"""
def __init__(self,
......@@ -87,6 +91,7 @@ class TransformerEncoder(tf.keras.Model):
return_all_encoder_outputs=False,
output_range=None,
embedding_width=None,
embedding_layer=None,
**kwargs):
activation = tf.keras.activations.get(activation)
initializer = tf.keras.initializers.get(initializer)
......@@ -121,11 +126,14 @@ class TransformerEncoder(tf.keras.Model):
if embedding_width is None:
embedding_width = hidden_size
self._embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
if embedding_layer is None:
self._embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
self._embedding_layer = embedding_layer
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
......@@ -209,6 +217,9 @@ class TransformerEncoder(tf.keras.Model):
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
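# A minimal sketch (not part of this file; sizes are placeholders) of tying ELECTRA
# generator/discriminator embeddings through the new `embedding_layer` argument:
#   generator = TransformerEncoder(
#       vocab_size=30522, num_layers=3, sequence_length=128)
#   discriminator = TransformerEncoder(
#       vocab_size=30522, num_layers=12, sequence_length=128,
#       embedding_layer=generator.get_embedding_layer())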
def get_config(self):
return self._config_dict
......
......@@ -22,151 +22,10 @@ from __future__ import print_function
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling import layers
from official.nlp.nhnet import multi_channel_attention
from official.nlp.modeling.layers import transformer
from official.nlp.transformer import model_utils as transformer_utils
class TransformerDecoderBlock(tf.keras.layers.Layer):
"""Single transformer layer for decoder.
It has three sub-layers:
(1) a multi-head self-attention mechanism.
(2) an encoder-decoder attention mechanism.
(3) a position-wise, fully connected feed-forward network.
"""
def __init__(self,
hidden_size=768,
num_attention_heads=12,
intermediate_size=3072,
intermediate_activation="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
multi_channel_cross_attention=False,
**kwargs):
super(TransformerDecoderBlock, self).__init__(**kwargs)
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.intermediate_activation = tf_utils.get_activation(
intermediate_activation)
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.multi_channel_cross_attention = multi_channel_cross_attention
self._kernel_initializer = tf.keras.initializers.TruncatedNormal(
stddev=initializer_range)
self._bias_initializer = tf.keras.initializers.get("zeros")
if self.multi_channel_cross_attention:
self._cross_attention_cls = multi_channel_attention.MultiChannelAttention
else:
self._cross_attention_cls = layers.MultiHeadAttention
if self.hidden_size % self.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (self.hidden_size, self.num_attention_heads))
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
def build(self, input_shape):
# Self attention.
self.self_attention = layers.CachedAttention(
num_heads=self.num_attention_heads,
key_size=self.attention_head_size,
dropout=self.attention_probs_dropout_prob,
kernel_initializer=self._kernel_initializer,
name="self_attention")
self.self_attention_output_dense = layers.DenseEinsum(
output_shape=self.hidden_size,
num_summed_dimensions=2,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
name="self_attention_output")
self.self_attention_dropout = tf.keras.layers.Dropout(
rate=self.hidden_dropout_prob)
self.self_attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
# Encoder-decoder attention.
self.encdec_attention = self._cross_attention_cls(
num_heads=self.num_attention_heads,
key_size=self.attention_head_size,
dropout=self.attention_probs_dropout_prob,
output_shape=self.hidden_size,
kernel_initializer=self._kernel_initializer,
name="attention/encdec")
self.encdec_attention_dropout = tf.keras.layers.Dropout(
rate=self.hidden_dropout_prob)
self.encdec_attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))
# Feed-forward projection.
self.intermediate_dense = layers.DenseEinsum(
output_shape=self.intermediate_size,
activation=None,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
name="intermediate")
self.intermediate_activation_layer = tf.keras.layers.Activation(
self.intermediate_activation)
self.output_dense = layers.DenseEinsum(
output_shape=self.hidden_size,
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
name="output")
self.output_dropout = tf.keras.layers.Dropout(rate=self.hidden_dropout_prob)
self.output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm", axis=-1, epsilon=1e-12)
super(TransformerDecoderBlock, self).build(input_shape)
def common_layers_with_encoder(self):
"""Gets layer objects that can make a Transformer encoder block."""
return [
self.self_attention, self.self_attention_layer_norm,
self.intermediate_dense, self.output_dense, self.output_layer_norm
]
def call(self, inputs, cache=None, decode_loop_step=None):
if self.multi_channel_cross_attention:
if len(inputs) != 5:
raise ValueError(
"TransformerDecoderBlock must have 5 inputs, when it uses "
"multi_channel_cross_attention. But it got: %d" % len(inputs))
elif len(inputs) != 4:
raise ValueError(
"TransformerDecoderBlock must have 4 inputs, but it got: %d" %
len(inputs))
input_tensor, memory, attention_mask, self_attention_mask = inputs[:4]
self_attention_inputs = [input_tensor, input_tensor]
self_attention_output, cache = self.self_attention(
self_attention_inputs,
attention_mask=self_attention_mask,
cache=cache,
decode_loop_step=decode_loop_step)
self_attention_output = self.self_attention_dropout(self_attention_output)
self_attention_output = self.self_attention_layer_norm(
input_tensor + self_attention_output)
cross_attn_inputs = [self_attention_output, memory]
if self.multi_channel_cross_attention:
# Accesses the 5-th input tensor for the doc-attention probabilities.
cross_attn_inputs.append(inputs[-1])
attention_output = self.encdec_attention(cross_attn_inputs, attention_mask)
attention_output = self.encdec_attention_dropout(attention_output)
attention_output = self.encdec_attention_layer_norm(self_attention_output +
attention_output)
intermediate_output = self.intermediate_dense(attention_output)
intermediate_output = self.intermediate_activation_layer(
intermediate_output)
layer_output = self.output_dense(intermediate_output)
layer_output = self.output_dropout(layer_output)
layer_output = self.output_layer_norm(layer_output + attention_output)
return layer_output, cache
class TransformerDecoder(tf.keras.layers.Layer):
"""Transformer decoder stack."""
......@@ -200,14 +59,14 @@ class TransformerDecoder(tf.keras.layers.Layer):
self.layers = []
for i in range(self.num_hidden_layers):
self.layers.append(
TransformerDecoderBlock(
hidden_size=self.hidden_size,
transformer.TransformerDecoderLayer(
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
intermediate_activation=self.intermediate_activation,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
initializer_range=self.initializer_range,
dropout_rate=self.hidden_dropout_prob,
attention_dropout_rate=self.attention_probs_dropout_prob,
kernel_initializer=tf.keras.initializers.TruncatedNormal(
stddev=self.initializer_range),
multi_channel_cross_attention=self.multi_channel_cross_attention,
name=("layer_%d" % i)))
super(TransformerDecoder, self).build(unused_input_shapes)
......
......@@ -26,17 +26,6 @@ from official.nlp.nhnet import decoder
from official.nlp.nhnet import utils
def _create_cache(batch_size, init_decode_length, num_heads, head_size):
return {
"key":
tf.zeros([batch_size, init_decode_length, num_heads, head_size],
dtype=tf.float32),
"value":
tf.zeros([batch_size, init_decode_length, num_heads, head_size],
dtype=tf.float32)
}
class DecoderTest(tf.test.TestCase):
def setUp(self):
......@@ -56,26 +45,6 @@ class DecoderTest(tf.test.TestCase):
decoder_block.build(None)
self.assertEqual(len(decoder_block.layers), self._config.num_hidden_layers)
def test_decoder_block_with_cache(self):
decoder_block = decoder.TransformerDecoderBlock(
hidden_size=self._config.hidden_size,
num_attention_heads=self._config.num_attention_heads,
intermediate_size=self._config.intermediate_size,
intermediate_activation=self._config.hidden_act,
hidden_dropout_prob=self._config.hidden_dropout_prob,
attention_probs_dropout_prob=self._config.attention_probs_dropout_prob,
initializer_range=self._config.initializer_range)
# Forward path.
dummy_tensor = tf.zeros([2, 4, self._config.hidden_size], dtype=tf.float32)
dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
inputs = [dummy_tensor, dummy_tensor, dummy_mask, dummy_mask]
cache = _create_cache(
2, 0, self._config.num_attention_heads,
self._config.hidden_size // self._config.num_attention_heads)
output, cache = decoder_block(inputs, cache)
self.assertEqual(output.shape, (2, 4, self._config.hidden_size))
self.assertEqual(cache["value"].shape, (2, 4, 2, 8))
def test_bert_decoder(self):
seq_length = 10
encoder_input_ids = tf.keras.layers.Input(
......
......@@ -27,9 +27,9 @@ from typing import Optional, Text
from official.modeling import tf_utils
from official.modeling.hyperparams import params_dict
from official.nlp.modeling import networks
from official.nlp.modeling.layers import multi_channel_attention
from official.nlp.nhnet import configs
from official.nlp.nhnet import decoder
from official.nlp.nhnet import multi_channel_attention
from official.nlp.nhnet import utils
from official.nlp.transformer import beam_search
......@@ -273,7 +273,7 @@ class NHNet(Bert2Bert):
def __init__(self, params, bert_layer, decoder_layer, name=None):
super(NHNet, self).__init__(params, bert_layer, decoder_layer, name=name)
self.doc_attention = multi_channel_attention.DocAttention(
self.doc_attention = multi_channel_attention.VotingAttention(
num_heads=params.num_decoder_attn_heads,
head_size=params.hidden_size // params.num_decoder_attn_heads)
......
......@@ -21,13 +21,12 @@ from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import bert
from official.nlp.data import pretrain_dataloader
from official.nlp.modeling import losses as loss_lib
@dataclasses.dataclass
class MaskedLMConfig(cfg.TaskConfig):
"""The model config."""
network: bert.BertPretrainerConfig = bert.BertPretrainerConfig(cls_heads=[
model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(cls_heads=[
bert.ClsHeadConfig(
inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence')
])
......@@ -40,31 +39,31 @@ class MaskedLMTask(base_task.Task):
"""Mock task object for testing."""
def build_model(self):
return bert.instantiate_from_cfg(self.task_config.network)
return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)
def build_losses(self,
features,
labels,
model_outputs,
metrics,
aux_losses=None) -> tf.Tensor:
metrics = dict([(metric.name, metric) for metric in metrics])
lm_output = tf.nn.log_softmax(model_outputs['lm_output'], axis=-1)
mlm_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
labels=features['masked_lm_ids'],
predictions=lm_output,
weights=features['masked_lm_weights'])
lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
labels['masked_lm_ids'],
tf.cast(model_outputs['lm_output'], tf.float32),
from_logits=True)
lm_label_weights = labels['masked_lm_weights']
lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
lm_denominator_loss = tf.reduce_sum(lm_label_weights)
mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
metrics['lm_example_loss'].update_state(mlm_loss)
if 'next_sentence_labels' in features:
policy = tf.keras.mixed_precision.experimental.global_policy()
if policy.name == 'mixed_bfloat16': # b/158514794: bf16 is not stable.
policy = tf.float32
predictions = tf.keras.layers.Activation(
tf.nn.log_softmax, dtype=policy)(model_outputs['next_sentence'])
sentence_labels = features['next_sentence_labels']
sentence_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
labels=sentence_labels,
predictions=predictions)
if 'next_sentence_labels' in labels:
sentence_labels = labels['next_sentence_labels']
sentence_outputs = tf.cast(
model_outputs['next_sentence'], dtype=tf.float32)
sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
sentence_labels,
sentence_outputs,
from_logits=True)
metrics['next_sentence_loss'].update_state(sentence_loss)
total_loss = mlm_loss + sentence_loss
else:
......@@ -77,6 +76,7 @@ class MaskedLMTask(base_task.Task):
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for pretraining."""
if params.input_path == 'dummy':
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
dummy_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32)
......@@ -112,15 +112,15 @@ class MaskedLMTask(base_task.Task):
metrics.append(tf.keras.metrics.Mean(name='next_sentence_loss'))
return metrics
def process_metrics(self, metrics, inputs, outputs):
def process_metrics(self, metrics, labels, model_outputs):
metrics = dict([(metric.name, metric) for metric in metrics])
if 'masked_lm_accuracy' in metrics:
metrics['masked_lm_accuracy'].update_state(inputs['masked_lm_ids'],
outputs['lm_output'],
inputs['masked_lm_weights'])
metrics['masked_lm_accuracy'].update_state(labels['masked_lm_ids'],
model_outputs['lm_output'],
labels['masked_lm_weights'])
if 'next_sentence_accuracy' in metrics:
metrics['next_sentence_accuracy'].update_state(
inputs['next_sentence_labels'], outputs['next_sentence'])
labels['next_sentence_labels'], model_outputs['next_sentence'])
def train_step(self, inputs, model: tf.keras.Model,
optimizer: tf.keras.optimizers.Optimizer, metrics):
......@@ -139,7 +139,7 @@ class MaskedLMTask(base_task.Task):
outputs = model(inputs, training=True)
# Computes per-replica loss.
loss = self.build_losses(
features=inputs,
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
......@@ -166,7 +166,7 @@ class MaskedLMTask(base_task.Task):
"""
outputs = self.inference_step(inputs, model)
loss = self.build_losses(
features=inputs,
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
......
......@@ -26,7 +26,7 @@ class MLMTaskTest(tf.test.TestCase):
def test_task(self):
config = masked_lm.MaskedLMConfig(
network=bert.BertPretrainerConfig(
model=bert.BertPretrainerConfig(
encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
num_masked_tokens=20,
cls_heads=[
......
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Question answering task."""
import collections
import json
import os
from absl import logging
import dataclasses
import tensorflow as tf
import tensorflow_hub as hub
from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.bert import input_pipeline
from official.nlp.bert import squad_evaluate_v1_1
from official.nlp.bert import squad_evaluate_v2_0
from official.nlp.bert import tokenization
from official.nlp.configs import encoders
from official.nlp.data import squad_lib as squad_lib_wp
from official.nlp.data import squad_lib_sp
from official.nlp.modeling import models
from official.nlp.tasks import utils
@dataclasses.dataclass
class QuestionAnsweringConfig(cfg.TaskConfig):
"""The model config."""
# At most one of `init_checkpoint` and `hub_module_url` can be specified.
init_checkpoint: str = ''
hub_module_url: str = ''
n_best_size: int = 20
max_answer_length: int = 30
null_score_diff_threshold: float = 0.0
model: encoders.TransformerEncoderConfig = (
encoders.TransformerEncoderConfig())
train_data: cfg.DataConfig = cfg.DataConfig()
validation_data: cfg.DataConfig = cfg.DataConfig()
@base_task.register_task_cls(QuestionAnsweringConfig)
class QuestionAnsweringTask(base_task.Task):
"""Task object for question answering."""
def __init__(self, params=cfg.TaskConfig):
super(QuestionAnsweringTask, self).__init__(params)
if params.hub_module_url and params.init_checkpoint:
raise ValueError('At most one of `hub_module_url` and '
'`init_checkpoint` can be specified.')
if params.hub_module_url:
self._hub_module = hub.load(params.hub_module_url)
else:
self._hub_module = None
if params.validation_data.tokenization == 'WordPiece':
self.squad_lib = squad_lib_wp
elif params.validation_data.tokenization == 'SentencePiece':
self.squad_lib = squad_lib_sp
else:
raise ValueError('Unsupported tokenization method: {}'.format(
params.validation_data.tokenization))
def build_model(self):
if self._hub_module:
encoder_network = utils.get_encoder_from_hub(self._hub_module)
else:
encoder_network = encoders.instantiate_encoder_from_cfg(
self.task_config.model)
return models.BertSpanLabeler(
network=encoder_network,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=self.task_config.model.initializer_range))
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
start_positions = labels['start_positions']
end_positions = labels['end_positions']
start_logits, end_logits = model_outputs
start_loss = tf.keras.losses.sparse_categorical_crossentropy(
start_positions,
tf.cast(start_logits, dtype=tf.float32),
from_logits=True)
end_loss = tf.keras.losses.sparse_categorical_crossentropy(
end_positions,
tf.cast(end_logits, dtype=tf.float32),
from_logits=True)
loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2
return loss
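# A toy illustration (values are placeholders) of the span loss computed above:
#   start_logits = tf.constant([[2.0, 0.5, 0.1]])   # [batch=1, seq_len=3]
#   end_logits = tf.constant([[0.1, 0.5, 2.0]])
#   labels = {'start_positions': tf.constant([0]), 'end_positions': tf.constant([2])}
#   # loss = (sparse CE over start_logits + sparse CE over end_logits) / 2,
#   # each computed with from_logits=True.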
def _preprocess_eval_data(self, params):
eval_examples = self.squad_lib.read_squad_examples(
input_file=params.input_path,
is_training=False,
version_2_with_negative=params.version_2_with_negative)
temp_file_path = params.input_preprocessed_data_path or '/tmp'
eval_writer = self.squad_lib.FeatureWriter(
filename=os.path.join(temp_file_path, 'eval.tf_record'),
is_training=False)
eval_features = []
def _append_feature(feature, is_padding):
if not is_padding:
eval_features.append(feature)
eval_writer.process_feature(feature)
kwargs = dict(
examples=eval_examples,
tokenizer=tokenization.FullTokenizer(
vocab_file=params.vocab_file,
do_lower_case=params.do_lower_case),
max_seq_length=params.seq_length,
doc_stride=params.doc_stride,
max_query_length=params.query_length,
is_training=False,
output_fn=_append_feature,
batch_size=params.global_batch_size)
if params.tokenization == 'SentencePiece':
# squad_lib_sp requires one more argument 'do_lower_case'.
kwargs['do_lower_case'] = params.do_lower_case
eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
eval_writer.close()
logging.info('***** Evaluation input stats *****')
logging.info(' Num orig examples = %d', len(eval_examples))
logging.info(' Num split examples = %d', len(eval_features))
logging.info(' Batch size = %d', params.global_batch_size)
logging.info(' Dataset size = %d', eval_dataset_size)
return eval_writer.filename, eval_examples, eval_features
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for sentence_prediction task."""
if params.input_path == 'dummy':
# Dummy training data for unit test.
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
x = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
y = dict(
start_positions=tf.constant(0, dtype=tf.int32),
end_positions=tf.constant(1, dtype=tf.int32))
return (x, y)
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(
dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset
if params.is_training:
input_path = params.input_path
else:
input_path, self._eval_examples, self._eval_features = (
self._preprocess_eval_data(params))
batch_size = input_context.get_per_replica_batch_size(
params.global_batch_size) if input_context else params.global_batch_size
# TODO(chendouble): add and use nlp.data.question_answering_dataloader.
dataset = input_pipeline.create_squad_dataset(
input_path,
params.seq_length,
batch_size,
is_training=params.is_training,
input_pipeline_context=input_context)
return dataset
def build_metrics(self, training=None):
del training
# TODO(lehou): a list of metrics doesn't work the same as in compile/fit.
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(
name='start_position_accuracy'),
tf.keras.metrics.SparseCategoricalAccuracy(
name='end_position_accuracy'),
]
return metrics
def process_metrics(self, metrics, labels, model_outputs):
metrics = dict([(metric.name, metric) for metric in metrics])
start_logits, end_logits = model_outputs
metrics['start_position_accuracy'].update_state(
labels['start_positions'], start_logits)
metrics['end_position_accuracy'].update_state(
labels['end_positions'], end_logits)
def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
start_logits, end_logits = model_outputs
compiled_metrics.update_state(
y_true=labels, # labels has keys 'start_positions' and 'end_positions'.
y_pred={'start_positions': start_logits, 'end_positions': end_logits})
def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
features, _ = inputs
unique_ids = features.pop('unique_ids')
model_outputs = self.inference_step(features, model)
start_logits, end_logits = model_outputs
logs = {
self.loss: 0.0, # TODO(lehou): compute the real validation loss.
'unique_ids': unique_ids,
'start_logits': start_logits,
'end_logits': end_logits,
}
return logs
raw_aggregated_result = collections.namedtuple(
'RawResult', ['unique_id', 'start_logits', 'end_logits'])
def aggregate_logs(self, state=None, step_outputs=None):
assert step_outputs is not None, 'Got no logs from self.validation_step.'
if state is None:
state = []
for unique_ids, start_logits, end_logits in zip(
step_outputs['unique_ids'],
step_outputs['start_logits'],
step_outputs['end_logits']):
u_ids, s_logits, e_logits = (
unique_ids.numpy(), start_logits.numpy(), end_logits.numpy())
if u_ids.size == 1:
u_ids = [u_ids]
s_logits = [s_logits]
e_logits = [e_logits]
for values in zip(u_ids, s_logits, e_logits):
state.append(self.raw_aggregated_result(
unique_id=values[0],
start_logits=values[1].tolist(),
end_logits=values[2].tolist()))
return state
def reduce_aggregated_logs(self, aggregated_logs):
all_predictions, _, scores_diff = (
self.squad_lib.postprocess_output(
self._eval_examples,
self._eval_features,
aggregated_logs,
self.task_config.n_best_size,
self.task_config.max_answer_length,
self.task_config.validation_data.do_lower_case,
version_2_with_negative=(
self.task_config.validation_data.version_2_with_negative),
null_score_diff_threshold=(
self.task_config.null_score_diff_threshold),
verbose=False))
with tf.io.gfile.GFile(
self.task_config.validation_data.input_path, 'r') as reader:
dataset_json = json.load(reader)
pred_dataset = dataset_json['data']
if self.task_config.validation_data.version_2_with_negative:
eval_metrics = squad_evaluate_v2_0.evaluate(
pred_dataset, all_predictions, scores_diff)
else:
eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
return eval_metrics
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
ckpt_dir_or_file = self.task_config.init_checkpoint
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.restore(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.nlp.tasks.question_answering."""
import itertools
import json
import os
from absl.testing import parameterized
import tensorflow as tf
from official.nlp.bert import configs
from official.nlp.bert import export_tfhub
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.tasks import question_answering
class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(QuestionAnsweringTaskTest, self).setUp()
self._encoder_config = encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1)
self._train_data_config = bert.QADataConfig(
input_path="dummy",
seq_length=128,
global_batch_size=1)
val_data = {"version": "1.1",
"data": [{"paragraphs": [
{"context": "Sky is blue.",
"qas": [{"question": "What is blue?", "id": "1234",
"answers": [{"text": "Sky", "answer_start": 0},
{"text": "Sky", "answer_start": 0},
{"text": "Sky", "answer_start": 0}]
}]}]}]}
self._val_input_path = os.path.join(self.get_temp_dir(), "val_data.json")
with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
writer.write(json.dumps(val_data, indent=4) + "\n")
self._test_vocab = os.path.join(self.get_temp_dir(), "vocab.txt")
with tf.io.gfile.GFile(self._test_vocab, "w") as writer:
writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")
def _get_validation_data_config(self, version_2_with_negative=False):
return bert.QADevDataConfig(
input_path=self._val_input_path,
input_preprocessed_data_path=self.get_temp_dir(),
seq_length=128,
global_batch_size=1,
version_2_with_negative=version_2_with_negative,
vocab_file=self._test_vocab,
tokenization="WordPiece",
do_lower_case=True)
def _run_task(self, config):
task = question_answering.QuestionAnsweringTask(config)
model = task.build_model()
metrics = task.build_metrics()
task.initialize(model)
train_dataset = task.build_inputs(config.train_data)
train_iterator = iter(train_dataset)
optimizer = tf.keras.optimizers.SGD(lr=0.1)
task.train_step(next(train_iterator), model, optimizer, metrics=metrics)
val_dataset = task.build_inputs(config.validation_data)
val_iterator = iter(val_dataset)
logs = task.validation_step(next(val_iterator), model, metrics=metrics)
logs = task.aggregate_logs(step_outputs=logs)
metrics = task.reduce_aggregated_logs(logs)
self.assertIn("final_f1", metrics)
@parameterized.parameters(itertools.product(
(False, True),
("WordPiece", "SentencePiece"),
))
def test_task(self, version_2_with_negative, tokenization):
# Saves a checkpoint.
pretrain_cfg = bert.BertPretrainerConfig(
encoder=self._encoder_config,
num_masked_tokens=20,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=3, name="next_sentence")
])
pretrain_model = bert.instantiate_bertpretrainer_from_cfg(pretrain_cfg)
ckpt = tf.train.Checkpoint(
model=pretrain_model, **pretrain_model.checkpoint_items)
saved_path = ckpt.save(self.get_temp_dir())
config = question_answering.QuestionAnsweringConfig(
init_checkpoint=saved_path,
model=self._encoder_config,
train_data=self._train_data_config,
validation_data=self._get_validation_data_config(
version_2_with_negative))
self._run_task(config)
def test_task_with_fit(self):
config = question_answering.QuestionAnsweringConfig(
model=self._encoder_config,
train_data=self._train_data_config,
validation_data=self._get_validation_data_config())
task = question_answering.QuestionAnsweringTask(config)
model = task.build_model()
model = task.compile_model(
model,
optimizer=tf.keras.optimizers.SGD(lr=0.1),
train_step=task.train_step,
metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
dataset = task.build_inputs(config.train_data)
logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
self.assertIn("loss", logs.history)
self.assertIn("start_positions_accuracy", logs.history)
self.assertIn("end_positions_accuracy", logs.history)
def _export_bert_tfhub(self):
bert_config = configs.BertConfig(
vocab_size=30522,
hidden_size=16,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=1)
_, encoder = export_tfhub.create_bert_model(bert_config)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
with tf.io.gfile.GFile(vocab_file, "w") as f:
f.write("dummy content")
hub_destination = os.path.join(self.get_temp_dir(), "hub")
export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
hub_destination, vocab_file)
return hub_destination
def test_task_with_hub(self):
hub_module_url = self._export_bert_tfhub()
config = question_answering.QuestionAnsweringConfig(
hub_module_url=hub_module_url,
model=self._encoder_config,
train_data=self._train_data_config,
validation_data=self._get_validation_data_config())
self._run_task(config)
if __name__ == "__main__":
tf.test.main()
......@@ -14,8 +14,11 @@
# limitations under the License.
# ==============================================================================
"""Sentence prediction (classification) task."""
import logging
from absl import logging
import dataclasses
import numpy as np
from scipy import stats
from sklearn import metrics as sklearn_metrics
import tensorflow as tf
import tensorflow_hub as hub
......@@ -23,18 +26,19 @@ from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import bert
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.modeling import losses as loss_lib
from official.nlp.tasks import utils
@dataclasses.dataclass
class SentencePredictionConfig(cfg.TaskConfig):
"""The model config."""
# At most one of `pretrain_checkpoint_dir` and `hub_module_url` can
# At most one of `init_checkpoint` and `hub_module_url` can
# be specified.
pretrain_checkpoint_dir: str = ''
init_checkpoint: str = ''
hub_module_url: str = ''
network: bert.BertPretrainerConfig = bert.BertPretrainerConfig(
num_masked_tokens=0,
metric_type: str = 'accuracy'
model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(
num_masked_tokens=0, # No masked language modeling head.
cls_heads=[
bert.ClsHeadConfig(
inner_dim=768,
......@@ -52,39 +56,28 @@ class SentencePredictionTask(base_task.Task):
def __init__(self, params=cfg.TaskConfig):
super(SentencePredictionTask, self).__init__(params)
if params.hub_module_url and params.pretrain_checkpoint_dir:
if params.hub_module_url and params.init_checkpoint:
raise ValueError('At most one of `hub_module_url` and '
'`init_checkpoint` can be specified.')
if params.hub_module_url:
self._hub_module = hub.load(params.hub_module_url)
else:
self._hub_module = None
self.metric_type = params.metric_type
def build_model(self):
if self._hub_module:
input_word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_type_ids')
bert_model = hub.KerasLayer(self._hub_module, trainable=True)
pooled_output, sequence_output = bert_model(
[input_word_ids, input_mask, input_type_ids])
encoder_from_hub = tf.keras.Model(
inputs=[input_word_ids, input_mask, input_type_ids],
outputs=[sequence_output, pooled_output])
return bert.instantiate_from_cfg(
self.task_config.network, encoder_network=encoder_from_hub)
encoder_from_hub = utils.get_encoder_from_hub(self._hub_module)
return bert.instantiate_bertpretrainer_from_cfg(
self.task_config.model, encoder_network=encoder_from_hub)
else:
return bert.instantiate_from_cfg(self.task_config.network)
return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)
def build_losses(self, features, model_outputs, aux_losses=None) -> tf.Tensor:
labels = features
loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
labels=labels,
predictions=tf.nn.log_softmax(model_outputs['sentence_prediction'],
axis=-1))
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
loss = tf.keras.losses.sparse_categorical_crossentropy(
labels,
tf.cast(model_outputs['sentence_prediction'], tf.float32),
from_logits=True)
if aux_losses:
loss += tf.add_n(aux_losses)
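# Illustrative sketch, not part of this change: how the loss above behaves on
# hand-made values (shapes and numbers are assumptions for demonstration only;
# relies on the `import tensorflow as tf` from this file's imports).
example_logits = tf.constant([[2.0, 0.5], [0.1, 1.5]])  # [batch, num_classes]
example_labels = tf.constant([0, 1], dtype=tf.int32)    # [batch]
example_loss = tf.keras.losses.sparse_categorical_crossentropy(
    example_labels, tf.cast(example_logits, tf.float32), from_logits=True)
# example_loss has shape (2,): one cross-entropy value per example.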
......@@ -93,13 +86,14 @@ class SentencePredictionTask(base_task.Task):
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for sentence_prediction task."""
if params.input_path == 'dummy':
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
x = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
y = tf.ones((1, 1), dtype=tf.int32)
y = tf.zeros((1, 1), dtype=tf.int32)
return (x, y)
dataset = tf.data.Dataset.range(1)
......@@ -113,22 +107,74 @@ class SentencePredictionTask(base_task.Task):
def build_metrics(self, training=None):
del training
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')
]
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
return metrics
def process_metrics(self, metrics, labels, outputs):
def process_metrics(self, metrics, labels, model_outputs):
for metric in metrics:
metric.update_state(labels, outputs['sentence_prediction'])
def process_compiled_metrics(self, compiled_metrics, labels, outputs):
compiled_metrics.update_state(labels, outputs['sentence_prediction'])
metric.update_state(labels, model_outputs['sentence_prediction'])
def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
compiled_metrics.update_state(labels, model_outputs['sentence_prediction'])
def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
if self.metric_type == 'accuracy':
return super(SentencePredictionTask,
self).validation_step(inputs, model, metrics)
features, labels = inputs
outputs = self.inference_step(features, model)
loss = self.build_losses(
labels=labels, model_outputs=outputs, aux_losses=model.losses)
logs = {self.loss: loss}
if self.metric_type == 'matthews_corrcoef':
logs.update({
'sentence_prediction':
tf.expand_dims(
tf.math.argmax(outputs['sentence_prediction'], axis=1),
axis=0),
'labels':
labels,
})
if self.metric_type == 'pearson_spearman_corr':
logs.update({
'sentence_prediction': outputs['sentence_prediction'],
'labels': labels,
})
return logs
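# Illustrative sketch, not part of this change: the matthews_corrcoef branch
# logs predicted class ids with a leading per-replica axis. Values here are
# assumptions for demonstration only.
example_outputs = tf.constant([[0.2, 1.3], [2.0, -1.0]])  # [batch, num_classes]
example_ids = tf.math.argmax(example_outputs, axis=1)     # -> [1, 0]
example_logged = tf.expand_dims(example_ids, axis=0)      # shape (1, 2)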
def aggregate_logs(self, state=None, step_outputs=None):
if self.metric_type == 'accuracy':
return None
if state is None:
state = {'sentence_prediction': [], 'labels': []}
state['sentence_prediction'].append(
np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']],
axis=0))
state['labels'].append(
np.concatenate([v.numpy() for v in step_outputs['labels']], axis=0))
return state
def reduce_aggregated_logs(self, aggregated_logs):
if self.metric_type == 'matthews_corrcoef':
preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
labels = np.concatenate(aggregated_logs['labels'], axis=0)
return {
self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels)
}
if self.metric_type == 'pearson_spearman_corr':
preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
labels = np.concatenate(aggregated_logs['labels'], axis=0)
pearson_corr = stats.pearsonr(preds, labels)[0]
spearman_corr = stats.spearmanr(preds, labels)[0]
corr_metric = (pearson_corr + spearman_corr) / 2
return {self.metric_type: corr_metric}
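# Illustrative sketch, not part of this change: the numpy-side metrics used
# above on tiny hand-made arrays (values are assumptions for demonstration).
example_preds = np.array([1, 0, 1, 1])
example_labels = np.array([1, 0, 0, 1])
example_mcc = sklearn_metrics.matthews_corrcoef(example_labels, example_preds)

example_scores = np.array([0.1, 0.4, 0.35, 0.8])   # regression-style outputs
example_targets = np.array([0.0, 0.5, 0.3, 0.9])
example_pearson = stats.pearsonr(example_scores, example_targets)[0]
example_spearman = stats.spearmanr(example_scores, example_targets)[0]
example_corr = (example_pearson + example_spearman) / 2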
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
pretrain_ckpt_dir = self.task_config.pretrain_checkpoint_dir
if not pretrain_ckpt_dir:
ckpt_dir_or_file = self.task_config.init_checkpoint
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
pretrain2finetune_mapping = {
......@@ -138,10 +184,7 @@ class SentencePredictionTask(base_task.Task):
model.checkpoint_items['sentence_prediction.pooler_dense'],
}
ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping)
latest_pretrain_ckpt = tf.train.latest_checkpoint(pretrain_ckpt_dir)
if latest_pretrain_ckpt is None:
raise FileNotFoundError(
'Cannot find pretrain checkpoint under {}'.format(pretrain_ckpt_dir))
status = ckpt.restore(latest_pretrain_ckpt)
status = ckpt.restore(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('finished loading pretrained checkpoint.')
logging.info('finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
......@@ -16,6 +16,8 @@
"""Tests for official.nlp.tasks.sentence_prediction."""
import functools
import os
from absl.testing import parameterized
import tensorflow as tf
from official.nlp.bert import configs
......@@ -25,7 +27,24 @@ from official.nlp.configs import encoders
from official.nlp.tasks import sentence_prediction
class SentencePredictionTaskTest(tf.test.TestCase):
class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(SentencePredictionTaskTest, self).setUp()
self._train_data_config = bert.SentencePredictionDataConfig(
input_path="dummy", seq_length=128, global_batch_size=1)
def get_model_config(self, num_classes):
return bert.BertPretrainerConfig(
encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
num_masked_tokens=0,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10,
num_classes=num_classes,
name="sentence_prediction")
])
def _run_task(self, config):
task = sentence_prediction.SentencePredictionTask(config)
......@@ -43,15 +62,9 @@ class SentencePredictionTaskTest(tf.test.TestCase):
def test_task(self):
config = sentence_prediction.SentencePredictionConfig(
network=bert.BertPretrainerConfig(
encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
num_masked_tokens=0,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=3, name="sentence_prediction")
]),
train_data=bert.BertSentencePredictionDataConfig(
input_path="dummy", seq_length=128, global_batch_size=1))
init_checkpoint=self.get_temp_dir(),
model=self.get_model_config(2),
train_data=self._train_data_config)
task = sentence_prediction.SentencePredictionTask(config)
model = task.build_model()
metrics = task.build_metrics()
......@@ -62,6 +75,58 @@ class SentencePredictionTaskTest(tf.test.TestCase):
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
# Saves a checkpoint.
pretrain_cfg = bert.BertPretrainerConfig(
encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
num_masked_tokens=20,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=3, name="next_sentence")
])
pretrain_model = bert.instantiate_bertpretrainer_from_cfg(pretrain_cfg)
ckpt = tf.train.Checkpoint(
model=pretrain_model, **pretrain_model.checkpoint_items)
ckpt.save(config.init_checkpoint)
task.initialize(model)
@parameterized.parameters(("matthews_corrcoef", 2),
("pearson_spearman_corr", 1))
def test_np_metrics(self, metric_type, num_classes):
config = sentence_prediction.SentencePredictionConfig(
metric_type=metric_type,
init_checkpoint=self.get_temp_dir(),
model=self.get_model_config(num_classes),
train_data=self._train_data_config)
task = sentence_prediction.SentencePredictionTask(config)
model = task.build_model()
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
strategy = tf.distribute.get_strategy()
distributed_outputs = strategy.run(
functools.partial(task.validation_step, model=model),
args=(next(iterator),))
outputs = tf.nest.map_structure(strategy.experimental_local_results,
distributed_outputs)
aggregated = task.aggregate_logs(step_outputs=outputs)
aggregated = task.aggregate_logs(state=aggregated, step_outputs=outputs)
self.assertIn(metric_type, task.reduce_aggregated_logs(aggregated))
def test_task_with_fit(self):
config = sentence_prediction.SentencePredictionConfig(
model=self.get_model_config(2), train_data=self._train_data_config)
task = sentence_prediction.SentencePredictionTask(config)
model = task.build_model()
model = task.compile_model(
model,
optimizer=tf.keras.optimizers.SGD(lr=0.1),
train_step=task.train_step,
metrics=task.build_metrics())
dataset = task.build_inputs(config.train_data)
logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
self.assertIn("loss", logs.history)
def _export_bert_tfhub(self):
bert_config = configs.BertConfig(
vocab_size=30522,
......@@ -89,15 +154,8 @@ class SentencePredictionTaskTest(tf.test.TestCase):
hub_module_url = self._export_bert_tfhub()
config = sentence_prediction.SentencePredictionConfig(
hub_module_url=hub_module_url,
network=bert.BertPretrainerConfig(
encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
num_masked_tokens=0,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=3, name="sentence_prediction")
]),
train_data=bert.BertSentencePredictionDataConfig(
input_path="dummy", seq_length=128, global_batch_size=10))
model=self.get_model_config(2),
train_data=self._train_data_config)
self._run_task(config)
......
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tagging (e.g., NER/POS) task."""
import logging
from typing import List, Optional
import dataclasses
from seqeval import metrics as seqeval_metrics
import tensorflow as tf
import tensorflow_hub as hub
from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import encoders
from official.nlp.data import tagging_data_loader
from official.nlp.modeling import models
from official.nlp.tasks import utils
@dataclasses.dataclass
class TaggingConfig(cfg.TaskConfig):
"""The model config."""
# At most one of `init_checkpoint` and `hub_module_url` can be specified.
init_checkpoint: str = ''
hub_module_url: str = ''
model: encoders.TransformerEncoderConfig = (
encoders.TransformerEncoderConfig())
# The real class names; their order should match the real label ids.
# Note that a word may be tokenized into multiple word-piece tokens, and
# we assume the real (non-negative) label id is assigned to the first token
# of the word, while a negative label id is assigned to the remaining tokens.
# Negative label ids do not contribute to the loss or metrics.
class_names: Optional[List[str]] = None
train_data: cfg.DataConfig = cfg.DataConfig()
validation_data: cfg.DataConfig = cfg.DataConfig()
def _masked_labels_and_weights(y_true):
"""Masks negative values from token level labels.
Args:
y_true: Token labels, typically shape (batch_size, seq_len), where tokens
with negative labels should be ignored during loss/accuracy calculation.
Returns:
(masked_y_true, masked_weights) where `masked_y_true` is the input
with each negative label replaced with zero and `masked_weights` is 0.0
where negative labels were replaced and 1.0 for original labels.
"""
# Ignore the classes of tokens with negative values.
mask = tf.greater_equal(y_true, 0)
# Replace negative labels, which are out of bounds for some loss functions,
# with zero.
masked_y_true = tf.where(mask, y_true, 0)
return masked_y_true, tf.cast(mask, tf.float32)
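# Illustrative sketch, not part of this change: negative label ids (word-piece
# continuations / padding) are zeroed out and given weight 0.0. Values are
# assumptions for demonstration only.
example_y_true = tf.constant([[2, -1, 0, 1, -1]])
example_labels, example_weights = _masked_labels_and_weights(example_y_true)
# example_labels  -> [[2, 0, 0, 1, 0]]
# example_weights -> [[1., 0., 1., 1., 0.]]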
@base_task.register_task_cls(TaggingConfig)
class TaggingTask(base_task.Task):
"""Task object for tagging (e.g., NER or POS)."""
def __init__(self, params=cfg.TaskConfig):
super(TaggingTask, self).__init__(params)
if params.hub_module_url and params.init_checkpoint:
raise ValueError('At most one of `hub_module_url` and '
'`init_checkpoint` can be specified.')
if not params.class_names:
raise ValueError('TaggingConfig.class_names cannot be empty.')
if params.hub_module_url:
self._hub_module = hub.load(params.hub_module_url)
else:
self._hub_module = None
def build_model(self):
if self._hub_module:
encoder_network = utils.get_encoder_from_hub(self._hub_module)
else:
encoder_network = encoders.instantiate_encoder_from_cfg(
self.task_config.model)
return models.BertTokenClassifier(
network=encoder_network,
num_classes=len(self.task_config.class_names),
initializer=tf.keras.initializers.TruncatedNormal(
stddev=self.task_config.model.initializer_range),
dropout_rate=self.task_config.model.dropout_rate,
output='logits')
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
model_outputs = tf.cast(model_outputs, tf.float32)
masked_labels, masked_weights = _masked_labels_and_weights(labels)
loss = tf.keras.losses.sparse_categorical_crossentropy(
masked_labels, model_outputs, from_logits=True)
numerator_loss = tf.reduce_sum(loss * masked_weights)
denominator_loss = tf.reduce_sum(masked_weights)
loss = tf.math.divide_no_nan(numerator_loss, denominator_loss)
return loss
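# Illustrative sketch, not part of this change: the masked weights turn the
# per-token loss into a mean over real tokens only (numbers are assumptions).
example_token_loss = tf.constant([[0.5, 2.0, 1.0]])
example_weights = tf.constant([[1.0, 0.0, 1.0]])   # middle token is ignored
example_loss = tf.math.divide_no_nan(
    tf.reduce_sum(example_token_loss * example_weights),
    tf.reduce_sum(example_weights))
# example_loss -> 0.75, i.e. (0.5 + 1.0) / 2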
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for sentence_prediction task."""
if params.input_path == 'dummy':
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
x = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
# Include some label_id as -1, which will be ignored in loss/metrics.
y = tf.random.uniform(
shape=(1, params.seq_length),
minval=-1,
maxval=len(self.task_config.class_names),
dtype=tf.dtypes.int32)
return (x, y)
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(
dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset
dataset = tagging_data_loader.TaggingDataLoader(params).load(input_context)
return dataset
def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
"""Validatation step.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
features, labels = inputs
outputs = self.inference_step(features, model)
loss = self.build_losses(labels=labels, model_outputs=outputs)
# Negative label ids are padding labels which should be ignored.
real_label_index = tf.where(tf.greater_equal(labels, 0))
predict_ids = tf.math.argmax(outputs, axis=-1)
predict_ids = tf.gather_nd(predict_ids, real_label_index)
label_ids = tf.gather_nd(labels, real_label_index)
return {
self.loss: loss,
'predict_ids': predict_ids,
'label_ids': label_ids,
}
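# Illustrative sketch, not part of this change: tf.where + tf.gather_nd keep
# only the positions whose labels are non-negative. Values are assumptions.
example_labels = tf.constant([[1, -1, 0], [2, 0, -1]])
example_logits = tf.random.uniform((2, 3, 4))            # [batch, seq_len, classes]
example_index = tf.where(tf.greater_equal(example_labels, 0))
example_preds = tf.gather_nd(
    tf.math.argmax(example_logits, axis=-1), example_index)      # shape (4,)
example_label_ids = tf.gather_nd(example_labels, example_index)  # [1, 0, 2, 0]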
def aggregate_logs(self, state=None, step_outputs=None):
"""Aggregates over logs returned from a validation step."""
if state is None:
state = {'predict_class': [], 'label_class': []}
def id_to_class_name(batched_ids):
class_names = []
for per_example_ids in batched_ids:
class_names.append([])
for per_token_id in per_example_ids.numpy().tolist():
class_names[-1].append(self.task_config.class_names[per_token_id])
return class_names
# Convert id to class names, because `seqeval_metrics` relies on the class
# name to decide IOB tags.
state['predict_class'].extend(id_to_class_name(step_outputs['predict_ids']))
state['label_class'].extend(id_to_class_name(step_outputs['label_ids']))
return state
def reduce_aggregated_logs(self, aggregated_logs):
"""Reduces aggregated logs over validation steps."""
label_class = aggregated_logs['label_class']
predict_class = aggregated_logs['predict_class']
return {
'f1':
seqeval_metrics.f1_score(label_class, predict_class),
'precision':
seqeval_metrics.precision_score(label_class, predict_class),
'recall':
seqeval_metrics.recall_score(label_class, predict_class),
'accuracy':
seqeval_metrics.accuracy_score(label_class, predict_class),
}
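# Illustrative sketch, not part of this change: seqeval consumes lists of IOB
# class *names*, which is why ids are mapped to names above. Tags here are
# assumptions for demonstration only.
example_true = [["O", "B-PER", "I-PER", "O"]]
example_pred = [["O", "B-PER", "O", "O"]]
example_f1 = seqeval_metrics.f1_score(example_true, example_pred)
example_accuracy = seqeval_metrics.accuracy_score(example_true, example_pred)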
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
ckpt_dir_or_file = self.task_config.init_checkpoint
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if not ckpt_dir_or_file:
return
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.restore(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.nlp.tasks.tagging."""
import functools
import os
import tensorflow as tf
from official.nlp.bert import configs
from official.nlp.bert import export_tfhub
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.tasks import tagging
class TaggingTest(tf.test.TestCase):
def setUp(self):
super(TaggingTest, self).setUp()
self._encoder_config = encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1)
self._train_data_config = bert.TaggingDataConfig(
input_path="dummy", seq_length=128, global_batch_size=1)
def _run_task(self, config):
task = tagging.TaggingTask(config)
model = task.build_model()
metrics = task.build_metrics()
strategy = tf.distribute.get_strategy()
dataset = strategy.experimental_distribute_datasets_from_function(
functools.partial(task.build_inputs, config.train_data))
iterator = iter(dataset)
optimizer = tf.keras.optimizers.SGD(lr=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
def test_task(self):
# Saves a checkpoint.
encoder = encoders.instantiate_encoder_from_cfg(self._encoder_config)
ckpt = tf.train.Checkpoint(encoder=encoder)
saved_path = ckpt.save(self.get_temp_dir())
config = tagging.TaggingConfig(
init_checkpoint=saved_path,
model=self._encoder_config,
train_data=self._train_data_config,
class_names=["O", "B-PER", "I-PER"])
task = tagging.TaggingTask(config)
model = task.build_model()
metrics = task.build_metrics()
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
optimizer = tf.keras.optimizers.SGD(lr=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
task.initialize(model)
def test_task_with_fit(self):
config = tagging.TaggingConfig(
model=self._encoder_config,
train_data=self._train_data_config,
class_names=["O", "B-PER", "I-PER"])
task = tagging.TaggingTask(config)
model = task.build_model()
model = task.compile_model(
model,
optimizer=tf.keras.optimizers.SGD(lr=0.1),
train_step=task.train_step,
metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
dataset = task.build_inputs(config.train_data)
logs = model.fit(dataset, epochs=1, steps_per_epoch=2)
self.assertIn("loss", logs.history)
self.assertIn("accuracy", logs.history)
def _export_bert_tfhub(self):
bert_config = configs.BertConfig(
vocab_size=30522,
hidden_size=16,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_hidden_layers=1)
_, encoder = export_tfhub.create_bert_model(bert_config)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
checkpoint = tf.train.Checkpoint(model=encoder)
checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)
vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
with tf.io.gfile.GFile(vocab_file, "w") as f:
f.write("dummy content")
hub_destination = os.path.join(self.get_temp_dir(), "hub")
export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
hub_destination, vocab_file)
return hub_destination
def test_task_with_hub(self):
hub_module_url = self._export_bert_tfhub()
config = tagging.TaggingConfig(
hub_module_url=hub_module_url,
model=self._encoder_config,
class_names=["O", "B-PER", "I-PER"],
train_data=self._train_data_config)
self._run_task(config)
def test_seqeval_metrics(self):
config = tagging.TaggingConfig(
model=self._encoder_config,
train_data=self._train_data_config,
class_names=["O", "B-PER", "I-PER"])
task = tagging.TaggingTask(config)
model = task.build_model()
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
strategy = tf.distribute.get_strategy()
distributed_outputs = strategy.run(
functools.partial(task.validation_step, model=model),
args=(next(iterator),))
outputs = tf.nest.map_structure(strategy.experimental_local_results,
distributed_outputs)
aggregated = task.aggregate_logs(step_outputs=outputs)
aggregated = task.aggregate_logs(state=aggregated, step_outputs=outputs)
self.assertCountEqual({"f1", "precision", "recall", "accuracy"},
task.reduce_aggregated_logs(aggregated).keys())
if __name__ == "__main__":
tf.test.main()
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,37 +13,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for cloud_lib."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import mock
import requests
from official.r1.utils.logs import cloud_lib
class CloudLibTest(unittest.TestCase):
@mock.patch("requests.get")
def test_on_gcp(self, mock_requests_get):
mock_response = mock.MagicMock()
mock_requests_get.return_value = mock_response
mock_response.status_code = 200
self.assertEqual(cloud_lib.on_gcp(), True)
@mock.patch("requests.get")
def test_not_on_gcp(self, mock_requests_get):
mock_requests_get.side_effect = requests.exceptions.ConnectionError()
self.assertEqual(cloud_lib.on_gcp(), False)
if __name__ == "__main__":
unittest.main()
"""Common utils for tasks."""
import tensorflow as tf
import tensorflow_hub as hub
def get_encoder_from_hub(hub_module: str) -> tf.keras.Model:
"""Gets an encoder from hub."""
input_word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_type_ids')
hub_layer = hub.KerasLayer(hub_module, trainable=True)
pooled_output, sequence_output = hub_layer(
[input_word_ids, input_mask, input_type_ids])
return tf.keras.Model(
inputs=[input_word_ids, input_mask, input_type_ids],
outputs=[sequence_output, pooled_output])
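# Illustrative usage sketch, not part of this change: the path below is a
# placeholder for a TF2 BERT SavedModel (e.g., one produced by export_tfhub).
example_hub_module = hub.load('/tmp/exported_bert_hub')  # placeholder path
example_encoder = get_encoder_from_hub(example_hub_module)
example_ids = tf.zeros((1, 128), dtype=tf.int32)
example_sequence_output, example_pooled_output = example_encoder(
    [example_ids, example_ids, example_ids])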