Commit 657dcda5 authored by Kaushik Shivakumar's avatar Kaushik Shivakumar
Browse files

pull latest

parents 26e24e21 e6017471
......@@ -15,11 +15,12 @@ can take full advantage of TensorFlow for their research and product development
| Date | News |
|------|------|
| June 17, 2020 | [Context R-CNN: Long Term Temporal Context for Per-Camera Object Detection](https://github.com/tensorflow/models/tree/master/research/object_detection#june-17th-2020) released
| May 21, 2020 | [Unifying Deep Local and Global Features for Image Search (DELG)](https://github.com/tensorflow/models/tree/master/research/delf#delg) code released
| May 19, 2020 | [MobileDets: Searching for Object Detection Architectures for Mobile Accelerators](https://github.com/tensorflow/models/tree/master/research/object_detection#may-19th-2020) released
| May 7, 2020 | [MnasFPN with MobileNet-V2 backbone](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md#mobile-models) released for object detection
| May 1, 2020 | [DELF: DEep Local Features](https://github.com/tensorflow/models/tree/master/research/delf) updated to support TensorFlow 2.1
| June 30, 2020 | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://github.com/tensorflow/models/tree/master/official/vision/detection#train-a-spinenet-49-based-mask-r-cnn) released ([Tweet](https://twitter.com/GoogleAI/status/1278016712978264064)) |
| June 17, 2020 | [Context R-CNN: Long Term Temporal Context for Per-Camera Object Detection](https://github.com/tensorflow/models/tree/master/research/object_detection#june-17th-2020) released ([Tweet](https://twitter.com/GoogleAI/status/1276571419422253057)) |
| May 21, 2020 | [Unifying Deep Local and Global Features for Image Search (DELG)](https://github.com/tensorflow/models/tree/master/research/delf#delg) code released |
| May 19, 2020 | [MobileDets: Searching for Object Detection Architectures for Mobile Accelerators](https://github.com/tensorflow/models/tree/master/research/object_detection#may-19th-2020) released |
| May 7, 2020 | [MnasFPN with MobileNet-V2 backbone](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md#mobile-models) released for object detection |
| May 1, 2020 | [DELF: DEep Local Features](https://github.com/tensorflow/models/tree/master/research/delf) updated to support TensorFlow 2.1 |
| March 31, 2020 | [Introducing the Model Garden for TensorFlow 2](https://blog.tensorflow.org/2020/03/introducing-model-garden-for-tensorflow-2.html) ([Tweet](https://twitter.com/TensorFlow/status/1245029834633297921)) |
## [Milestones](https://github.com/tensorflow/models/milestones)
......
......@@ -19,9 +19,10 @@ In the near future, we will add:
* State-of-the-art language understanding models:
More members in Transformer family
* Start-of-the-art image classification models:
* State-of-the-art image classification models:
EfficientNet, MnasNet, and variants
* A set of excellent objection detection models.
* State-of-the-art object detection and instance segmentation models:
RetinaNet, Mask R-CNN, SpineNet, and variants
## Table of Contents
......@@ -52,6 +53,7 @@ In the near future, we will add:
| [RetinaNet](vision/detection) | [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) |
| [Mask R-CNN](vision/detection) | [Mask R-CNN](https://arxiv.org/abs/1703.06870) |
| [ShapeMask](vision/detection) | [ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors](https://arxiv.org/abs/1904.03239) |
| [SpineNet](vision/detection) | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://arxiv.org/abs/1912.05027) |
### Natural Language Processing
......
......@@ -271,6 +271,23 @@ class RetinanetBenchmarkReal(RetinanetAccuracy):
FLAGS.strategy_type = 'tpu'
self._run_and_report_benchmark(params, do_eval=False, warmup=0)
  @flagsaver.flagsaver
  def benchmark_2x2_tpu_spinenet_coco(self):
    """Run SpineNet with RetinaNet model accuracy test with 4 TPUs."""
    self._setup()
    params = self._params()
    # Swap in the SpineNet backbone; 'identity' multilevel features
    # presumably bypasses the default feature-fusion module -- confirm
    # against the detection model config schema.
    params['architecture']['backbone'] = 'spinenet'
    params['architecture']['multilevel_features'] = 'identity'
    params['architecture']['use_bfloat16'] = False
    params['train']['batch_size'] = 64
    params['train']['total_steps'] = 1875  # One epoch.
    params['train']['iterations_per_loop'] = 500
    # Empty checkpoint path: train from scratch rather than from a
    # pre-trained backbone.
    params['train']['checkpoint']['path'] = ''
    FLAGS.model_dir = self._get_model_dir(
        'real_benchmark_2x2_tpu_spinenet_coco')
    FLAGS.strategy_type = 'tpu'
    self._run_and_report_benchmark(params, do_eval=False, warmup=0)
if __name__ == '__main__':
tf.test.main()
......@@ -37,13 +37,25 @@ class Task(tf.Module):
# Special keys in train/validate step returned logs.
loss = "loss"
def __init__(self, params: cfg.TaskConfig):
def __init__(self, params: cfg.TaskConfig, logging_dir: str = None):
"""Task initialization.
Args:
params: cfg.TaskConfig instance.
logging_dir: a string pointing to where the model, summaries etc. will be
saved. You can also write additional stuff in this directory.
"""
self._task_config = params
self._logging_dir = logging_dir
@property
def task_config(self) -> cfg.TaskConfig:
return self._task_config
@property
def logging_dir(self) -> str:
return self._logging_dir
def initialize(self, model: tf.keras.Model):
"""A callback function used as CheckpointManager's init_fn.
......@@ -107,6 +119,7 @@ class Task(tf.Module):
"""Returns a dataset or a nested structure of dataset functions.
Dataset functions define per-host datasets with the per-replica batch size.
With distributed training, this method runs on remote hosts.
Args:
params: hyperparams to create input pipelines.
......@@ -172,6 +185,8 @@ class Task(tf.Module):
metrics=None):
"""Does forward and backward.
With distribution strategies, this method runs on devices.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
......@@ -219,6 +234,8 @@ class Task(tf.Module):
def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
"""Validatation step.
With distribution strategies, this method runs on devices.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
......@@ -244,7 +261,17 @@ class Task(tf.Module):
return logs
def inference_step(self, inputs, model: tf.keras.Model):
"""Performs the forward step."""
"""Performs the forward step.
With distribution strategies, this method runs on devices.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
Returns:
Model outputs.
"""
return model(inputs, training=False)
def aggregate_logs(self, state, step_logs):
......
......@@ -126,10 +126,17 @@ class QADataConfig(cfg.DataConfig):
class QADevDataConfig(cfg.DataConfig):
  """Dev data config for question answering (tasks/question_answering)."""
  # Path to the raw dev-set input file(s).
  input_path: str = ""
  # Optional path to already-preprocessed dev data.
  # NOTE(review): presumably used in place of `input_path` when set -- confirm.
  input_preprocessed_data_path: str = ""
  # Whether the data contains unanswerable questions
  # (presumably SQuAD-v2 style -- confirm against the task implementation).
  version_2_with_negative: bool = False
  # Sliding-window stride for long documents (presumably; verify in the
  # preprocessing code).
  doc_stride: int = 128
  global_batch_size: int = 48
  # Dev data is never used for training.
  is_training: bool = False
  # Maximum total input sequence length after tokenization.
  seq_length: int = 384
  # Maximum length reserved for the question tokens.
  query_length: int = 64
  drop_remainder: bool = False
  vocab_file: str = ""
  tokenization: str = "WordPiece"  # WordPiece or SentencePiece
  do_lower_case: bool = True
@dataclasses.dataclass
......
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ELECTRA model configurations and instantiation methods."""
from typing import List, Optional
import dataclasses
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.modeling import layers
from official.nlp.modeling.models import electra_pretrainer
@dataclasses.dataclass
class ELECTRAPretrainerConfig(base_config.Config):
  """ELECTRA pretrainer configuration."""
  # Number of masked token positions the generator predicts per sequence
  # (becomes `num_token_predictions` in the instantiated pretrainer).
  num_masked_tokens: int = 76
  # Maximum input sequence length.
  sequence_length: int = 512
  # Number of output classes passed to the pretrainer.
  num_classes: int = 2
  # Weight for the discriminator loss; not consumed in this file --
  # presumably applied by the training task (confirm there).
  discriminator_loss_weight: float = 50.0
  # Transformer encoder config for the generator network.
  generator_encoder: encoders.TransformerEncoderConfig = (
      encoders.TransformerEncoderConfig())
  # Transformer encoder config for the discriminator network.
  discriminator_encoder: encoders.TransformerEncoderConfig = (
      encoders.TransformerEncoderConfig())
  # Optional classification heads (e.g. a next-sentence head).
  cls_heads: List[bert.ClsHeadConfig] = dataclasses.field(default_factory=list)
def instantiate_classification_heads_from_cfgs(
    cls_head_configs: List[bert.ClsHeadConfig]
) -> List[layers.ClassificationHead]:
  """Builds one `ClassificationHead` per config; empty list when none given."""
  if not cls_head_configs:
    return []
  heads = []
  for head_cfg in cls_head_configs:
    heads.append(layers.ClassificationHead(**head_cfg.as_dict()))
  return heads
def instantiate_pretrainer_from_cfg(
    config: ELECTRAPretrainerConfig,
    generator_network: Optional[tf.keras.Model] = None,
    discriminator_network: Optional[tf.keras.Model] = None,
) -> electra_pretrainer.ElectraPretrainer:
  """Instantiates ElectraPretrainer from the config.

  Args:
    config: the pretrainer configuration to build from.
    generator_network: optional pre-built generator encoder. When `None`,
      one is created from `config.generator_encoder`.
    discriminator_network: optional pre-built discriminator encoder. When
      `None`, one is created from `config.discriminator_encoder`.

  Returns:
    An `electra_pretrainer.ElectraPretrainer` instance.
  """
  gen_cfg = config.generator_encoder
  disc_cfg = config.discriminator_encoder
  if generator_network is None:
    generator_network = encoders.instantiate_encoder_from_cfg(gen_cfg)
  if discriminator_network is None:
    discriminator_network = encoders.instantiate_encoder_from_cfg(disc_cfg)
  # The MLM head reuses the generator encoder's activation and initializer
  # settings so the two stay consistent.
  return electra_pretrainer.ElectraPretrainer(
      generator_network=generator_network,
      discriminator_network=discriminator_network,
      vocab_size=gen_cfg.vocab_size,
      num_classes=config.num_classes,
      sequence_length=config.sequence_length,
      last_hidden_dim=gen_cfg.hidden_size,
      num_token_predictions=config.num_masked_tokens,
      mlm_activation=tf_utils.get_activation(gen_cfg.hidden_activation),
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=gen_cfg.initializer_range),
      classification_heads=instantiate_classification_heads_from_cfgs(
          config.cls_heads))
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for ELECTRA configurations and models instantiation."""
import tensorflow as tf
from official.nlp.configs import bert
from official.nlp.configs import electra
from official.nlp.configs import encoders
class ELECTRAModelsTest(tf.test.TestCase):

  def test_network_invocation(self):
    """Builds pretrainers from configs, with and without cls heads."""
    # Plain pretrainer: no classification heads attached.
    plain_config = electra.ELECTRAPretrainerConfig(
        generator_encoder=encoders.TransformerEncoderConfig(
            vocab_size=10, num_layers=1),
        discriminator_encoder=encoders.TransformerEncoderConfig(
            vocab_size=10, num_layers=2))
    _ = electra.instantiate_pretrainer_from_cfg(plain_config)

    # Same encoders, plus one classification head on top.
    headed_config = electra.ELECTRAPretrainerConfig(
        generator_encoder=encoders.TransformerEncoderConfig(
            vocab_size=10, num_layers=1),
        discriminator_encoder=encoders.TransformerEncoderConfig(
            vocab_size=10, num_layers=2),
        cls_heads=[
            bert.ClsHeadConfig(
                inner_dim=10, num_classes=2, name="next_sentence")
        ])
    _ = electra.instantiate_pretrainer_from_cfg(headed_config)
if __name__ == "__main__":
tf.test.main()
......@@ -302,14 +302,15 @@ class PawsxProcessor(DataProcessor):
"""See base class."""
lines = []
for lang in PawsxProcessor.supported_languages:
lines.extend(self._read_tsv(os.path.join(data_dir, f"dev-{lang}.tsv")))
lines.extend(
self._read_tsv(os.path.join(data_dir, lang, "dev_2k.tsv"))[1:])
examples = []
for (i, line) in enumerate(lines):
guid = "dev-%d" % i
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = self.process_text_fn(line[2])
text_a = self.process_text_fn(line[1])
text_b = self.process_text_fn(line[2])
label = self.process_text_fn(line[3])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
......@@ -318,12 +319,12 @@ class PawsxProcessor(DataProcessor):
"""See base class."""
examples_by_lang = {k: [] for k in self.supported_languages}
for lang in self.supported_languages:
lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv"))
lines = self._read_tsv(os.path.join(data_dir, lang, "test_2k.tsv"))[1:]
for (i, line) in enumerate(lines):
guid = "test-%d" % i
text_a = self.process_text_fn(line[0])
text_b = self.process_text_fn(line[1])
label = self.process_text_fn(line[2])
text_a = self.process_text_fn(line[1])
text_b = self.process_text_fn(line[2])
label = self.process_text_fn(line[3])
examples_by_lang[lang].append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples_by_lang
......
......@@ -32,14 +32,16 @@ from official.nlp.data import sentence_retrieval_lib
from official.nlp.data import squad_lib as squad_lib_wp
# sentence-piece tokenizer based squad_lib
from official.nlp.data import squad_lib_sp
from official.nlp.data import tagging_data_lib
FLAGS = flags.FLAGS
# TODO(chendouble): consider moving each task to its own binary.
flags.DEFINE_enum(
"fine_tuning_task_type", "classification",
["classification", "regression", "squad", "retrieval"],
["classification", "regression", "squad", "retrieval", "tagging"],
"The name of the BERT fine tuning task for which data "
"will be generated..")
"will be generated.")
# BERT classification specific flags.
flags.DEFINE_string(
......@@ -56,9 +58,6 @@ flags.DEFINE_enum("classification_task_name", "MNLI",
"only and for XNLI is all languages combined. Same for "
"PAWS-X.")
flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
"The name of sentence retrieval task for scoring")
# XNLI task specific flag.
flags.DEFINE_string(
"xnli_language", "en",
......@@ -71,6 +70,14 @@ flags.DEFINE_string(
"Language of trainig data for PAWS-X task. If the value is 'all', the data "
"of all languages will be used for training.")
# Retrieval task specific flags
flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"],
"The name of sentence retrieval task for scoring")
# Tagging task specific flags
flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"],
"The name of BERT tagging (token classification) task.")
# BERT Squad task specific flags.
flags.DEFINE_string(
"squad_data_file", None,
......@@ -284,6 +291,34 @@ def generate_retrieval_dataset():
FLAGS.max_seq_length)
def generate_tagging_dataset():
  """Generates the tagging (token classification) dataset.

  Returns:
    The metadata produced by
    `tagging_data_lib.generate_tf_record_from_data_file`.

  Raises:
    ValueError: if `--tagging_task_name` or `--tokenizer_impl` is unsupported.
  """
  # Map the --tagging_task_name flag value to its data processor class.
  processors = {
      "panx": tagging_data_lib.PanxProcessor,
      "udpos": tagging_data_lib.UdposProcessor,
  }
  task_name = FLAGS.tagging_task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % task_name)

  # Select the tokenizer together with the matching raw-text preprocessing
  # function; both must agree with the tokenization used at training time.
  if FLAGS.tokenizer_impl == "word_piece":
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    processor_text_fn = tokenization.convert_to_unicode
  elif FLAGS.tokenizer_impl == "sentence_piece":
    tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
    processor_text_fn = functools.partial(
        tokenization.preprocess_text, lower=FLAGS.do_lower_case)
  else:
    raise ValueError("Unsupported tokenizer_impl: %s" % FLAGS.tokenizer_impl)

  processor = processors[task_name]()
  return tagging_data_lib.generate_tf_record_from_data_file(
      processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length,
      FLAGS.train_data_output_path, FLAGS.eval_data_output_path,
      FLAGS.test_data_output_path, processor_text_fn)
def main(_):
if FLAGS.tokenizer_impl == "word_piece":
if not FLAGS.vocab_file:
......@@ -304,8 +339,11 @@ def main(_):
input_meta_data = generate_regression_dataset()
elif FLAGS.fine_tuning_task_type == "retrieval":
input_meta_data = generate_retrieval_dataset()
else:
elif FLAGS.fine_tuning_task_type == "squad":
input_meta_data = generate_squad_dataset()
else:
assert FLAGS.fine_tuning_task_type == "tagging"
input_meta_data = generate_tagging_dataset()
tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path))
with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer:
......
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library to process data for tagging task such as NER/POS."""
import collections
import os
from absl import logging
import tensorflow as tf
from official.nlp.data import classifier_data_lib
# A negative label id for the padding label, which will not contribute
# to loss/metrics in training.
_PADDING_LABEL_ID = -1
# The special unknown token, used to substitute a word which has too many
# subwords after tokenization.
_UNK_TOKEN = "[UNK]"
class InputExample(object):
  """A single training/test example for token classification.

  Attributes:
    sentence_id: id of the sentence this example belongs to.
    words: the (sub)words of the example, in order.
    label_ids: one integer label id per entry in `words`.
  """

  def __init__(self, sentence_id, words=None, label_ids=None):
    """Constructs an InputExample."""
    self.sentence_id = sentence_id
    # Falsy arguments (None or empty) are replaced by fresh lists; a
    # non-empty list argument is kept by reference, as callers expect.
    self.words = words or []
    self.label_ids = label_ids or []

  def add_word_and_label_id(self, word, label_id):
    """Appends one (word, label_id) pair to the example."""
    self.words.append(word)
    self.label_ids.append(label_id)
def _read_one_file(file_name, label_list):
  """Reads one file and returns a list of `InputExample` instances."""
  label_to_id = {label: idx for idx, label in enumerate(label_list)}
  examples = []
  current_sentence_id = 0
  current = InputExample(sentence_id=0)
  for raw_line in tf.io.gfile.GFile(file_name, "r").readlines():
    raw_line = raw_line.strip("\n")
    if not raw_line:
      # Empty line indicates a new sentence.
      if current.words:
        examples.append(current)
        current_sentence_id += 1
        current = InputExample(sentence_id=current_sentence_id)
      continue
    # The format is: <token>\t<label> for train/dev set and <token> for test.
    fields = raw_line.split("\t")
    assert len(fields) == 2 or len(fields) == 1
    token = fields[0].strip()
    # Test-set lines carry no label; assign a dummy label_id of 0.
    label_id = label_to_id[fields[1].strip()] if len(fields) == 2 else 0
    current.add_word_and_label_id(token, label_id)
  # Flush the trailing sentence when the file has no final blank line.
  if current.words:
    examples.append(current)
  return examples
class PanxProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Panx data set."""
  supported_languages = [
      "ar", "he", "vi", "id", "jv", "ms", "tl", "eu", "ml", "ta", "te", "af",
      "nl", "en", "de", "el", "bn", "hi", "mr", "ur", "fa", "fr", "it", "pt",
      "es", "bg", "ru", "ja", "ka", "ko", "th", "sw", "yo", "my", "zh", "kk",
      "tr", "et", "fi", "hu"
  ]

  def get_train_examples(self, data_dir):
    """Reads the English training split."""
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    """Reads the English dev split."""
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    """Reads the test split of every supported language, keyed by language."""
    labels = self.get_labels()
    return {
        language: _read_one_file(
            os.path.join(data_dir, "test-%s.tsv" % language), labels)
        for language in self.supported_languages
    }

  def get_labels(self):
    """Returns the NER tag list (B-/I- span tags plus the outside tag)."""
    return ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]

  @staticmethod
  def get_processor_name():
    return "panx"
class UdposProcessor(classifier_data_lib.DataProcessor):
  """Processor for the Udpos data set."""
  supported_languages = [
      "af", "ar", "bg", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr",
      "he", "hi", "hu", "id", "it", "ja", "kk", "ko", "mr", "nl", "pt", "ru",
      "ta", "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"
  ]

  def get_train_examples(self, data_dir):
    """Reads the English training split."""
    return _read_one_file(
        os.path.join(data_dir, "train-en.tsv"), self.get_labels())

  def get_dev_examples(self, data_dir):
    """Reads the English dev split."""
    return _read_one_file(
        os.path.join(data_dir, "dev-en.tsv"), self.get_labels())

  def get_test_examples(self, data_dir):
    """Reads the test split of every supported language, keyed by language."""
    labels = self.get_labels()
    return {
        language: _read_one_file(
            os.path.join(data_dir, "test-%s.tsv" % language), labels)
        for language in self.supported_languages
    }

  def get_labels(self):
    """Returns the part-of-speech tag list."""
    return [
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
        "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"
    ]

  @staticmethod
  def get_processor_name():
    return "udpos"
def _tokenize_example(example, max_length, tokenizer, text_preprocessing=None):
  """Tokenizes words and breaks long example into short ones."""
  # Reserve room for the [CLS] and [SEP] tokens added downstream.
  budget = max_length - 2
  # Reject examples carrying negative labels before doing any work (only
  # reachable when the example actually has words, as in the original flow).
  if example.words and any(label < 0 for label in example.label_ids):
    raise ValueError("Unexpected negative label_id: %s" % example.label_ids)
  pieces = []
  current = InputExample(sentence_id=example.sentence_id)
  for word_index, word in enumerate(example.words):
    if text_preprocessing:
      word = text_preprocessing(word)
    subwords = tokenizer.tokenize(word)
    # Substitute [UNK] when tokenization yields nothing or the single word
    # alone would overflow the budget.
    if (not subwords or len(subwords) > budget) and word:
      subwords = [_UNK_TOKEN]
    if len(subwords) + len(current.words) > budget:
      # Current piece is full; start a new one for the same sentence.
      pieces.append(current)
      current = InputExample(sentence_id=example.sentence_id)
    for subword_index, subword in enumerate(subwords):
      # Only the first subword keeps the real label; the rest get the
      # padding label so loss/metrics ignore them.
      label = (example.label_ids[word_index]
               if subword_index == 0 else _PADDING_LABEL_ID)
      current.add_word_and_label_id(subword, label)
  if current.words:
    pieces.append(current)
  return pieces
def _convert_single_example(example, max_seq_length, tokenizer):
  """Converts an `InputExample` instance to a `tf.train.Example` instance."""
  # Surround the sentence with [CLS]/[SEP]; those positions get the padding
  # label so they never contribute to loss/metrics.
  tokens = ["[CLS]"] + list(example.words) + ["[SEP]"]
  label_ids = ([_PADDING_LABEL_ID] + list(example.label_ids) +
               [_PADDING_LABEL_ID])
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_mask = [1] * len(input_ids)
  segment_ids = [0] * len(input_ids)

  # Pad every feature out to the fixed sequence length in one shot.
  pad_len = max_seq_length - len(input_ids)
  input_ids += [0] * pad_len
  input_mask += [0] * pad_len
  segment_ids += [0] * pad_len
  label_ids += [_PADDING_LABEL_ID] * pad_len

  def _int64_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  features = collections.OrderedDict()
  features["input_ids"] = _int64_feature(input_ids)
  features["input_mask"] = _int64_feature(input_mask)
  features["segment_ids"] = _int64_feature(segment_ids)
  features["label_ids"] = _int64_feature(label_ids)
  features["sentence_id"] = _int64_feature([example.sentence_id])
  return tf.train.Example(features=tf.train.Features(feature=features))
def write_example_to_file(examples,
                          tokenizer,
                          max_seq_length,
                          output_file,
                          text_preprocessing=None):
  """Writes `InputExample`s into a tfrecord file with `tf.train.Example` protos.

  Note that the words inside each example will be tokenized and be applied by
  `text_preprocessing` if available. Also, if the length of sentence (plus
  special [CLS] and [SEP] tokens) exceeds `max_seq_length`, the long sentence
  will be broken into multiple short examples. For example:

  Example (text_preprocessing=lowercase, max_seq_length=5)
  words:        ["What", "a", "great", "weekend"]
  labels:       [     7,   5,       9,        10]
  sentence_id:  0
  preprocessed: ["what", "a", "great", "weekend"]
  tokenized:    ["what", "a", "great", "week", "##end"]

  will result in two tf.example protos:

  tokens:      ["[CLS]", "what", "a", "great", "[SEP]"]
  label_ids:   [-1,          7,   5,       9,      -1]
  input_mask:  [ 1,          1,   1,       1,       1]
  segment_ids: [ 0,          0,   0,       0,       0]
  input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
  sentence_id: 0

  tokens:      ["[CLS]", "week", "##end", "[SEP]", "[PAD]"]
  label_ids:   [-1,          10,      -1,      -1,      -1]
  input_mask:  [ 1,           1,       1,       0,       0]
  segment_ids: [ 0,           0,       0,       0,       0]
  input_ids:   [ tokenizer.convert_tokens_to_ids(tokens) ]
  sentence_id: 0

  Note the use of -1 in `label_ids` to indicate that a token should not be
  considered for classification (e.g., trailing ## wordpieces or special
  token). Token classification models should accordingly ignore these when
  calculating loss, metrics, etc...

  Args:
    examples: A list of `InputExample` instances.
    tokenizer: The tokenizer to be applied on the data.
    max_seq_length: Maximum length of generated sequences.
    output_file: The name of the output tfrecord file.
    text_preprocessing: optional preprocessing run on each word prior to
      tokenization.

  Returns:
    The total number of tf.train.Example proto written to file.
  """
  tf.io.gfile.makedirs(os.path.dirname(output_file))
  writer = tf.io.TFRecordWriter(output_file)
  num_tokenized_examples = 0
  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      logging.info("Writing example %d of %d to %s", ex_index, len(examples),
                   output_file)
    # One input example may split into several length-bounded ones; all of
    # them share the original sentence_id.
    tokenized_examples = _tokenize_example(example, max_seq_length,
                                           tokenizer, text_preprocessing)
    num_tokenized_examples += len(tokenized_examples)
    for per_tokenized_example in tokenized_examples:
      tf_example = _convert_single_example(
          per_tokenized_example, max_seq_length, tokenizer)
      writer.write(tf_example.SerializeToString())

  writer.close()
  # Count of post-split examples, not of the input `examples`.
  return num_tokenized_examples
def token_classification_meta_data(train_data_size,
                                   max_seq_length,
                                   num_labels,
                                   eval_data_size=None,
                                   test_data_size=None,
                                   label_list=None,
                                   processor_type=None):
  """Creates metadata for tagging (token classification) datasets."""
  meta_data = {
      "train_data_size": train_data_size,
      "max_seq_length": max_seq_length,
      "num_labels": num_labels,
      "task_type": "tagging",
      "label_type": "int",
      "label_shape": [max_seq_length],
  }
  # Optional entries are written only when truthy, matching consumers that
  # probe for key presence.
  optional_entries = (
      ("eval_data_size", eval_data_size),
      ("test_data_size", test_data_size),
      ("label_list", label_list),
      ("processor_type", processor_type),
  )
  for key, value in optional_entries:
    if value:
      meta_data[key] = value
  return meta_data
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      tokenizer,
                                      max_seq_length,
                                      train_data_output_path,
                                      eval_data_output_path,
                                      test_data_output_path,
                                      text_preprocessing):
  """Generates tfrecord files from the raw data.

  Args:
    processor: data processor providing train/dev/test examples and labels.
    data_dir: directory holding the raw data files.
    tokenizer: tokenizer applied to every word.
    max_seq_length: maximum length of generated sequences.
    train_data_output_path: output path for the training tfrecord.
    eval_data_output_path: output path for the eval tfrecord.
    test_data_output_path: output path template for per-language test
      tfrecords; formatted with the language code.
    text_preprocessing: optional preprocessing run on each word prior to
      tokenization.

  Returns:
    Metadata dict describing the generated datasets.
  """

  def _write(split_examples, output_file):
    # Shared tokenize-and-write step for every split.
    return write_example_to_file(
        split_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        output_file=output_file,
        text_preprocessing=text_preprocessing)

  train_size = _write(processor.get_train_examples(data_dir),
                      train_data_output_path)
  eval_size = _write(processor.get_dev_examples(data_dir),
                     eval_data_output_path)

  # Test data is grouped per language; each language gets its own file.
  test_sizes = {}
  for language, split_examples in processor.get_test_examples(data_dir).items():
    test_sizes[language] = _write(
        split_examples, test_data_output_path.format(language))

  labels = processor.get_labels()
  return token_classification_meta_data(
      train_size,
      max_seq_length,
      len(labels),
      eval_size,
      test_sizes,
      label_list=labels,
      processor_type=processor.get_processor_name())
......@@ -4,6 +4,3 @@ Losses contains common loss computation used in NLP tasks.
* `weighted_sparse_categorical_crossentropy_loss` computes per-batch sparse
categorical crossentropy loss.
* `weighted_sparse_categorical_crossentropy_per_example_loss` computes
per-example sparse categorical crossentropy loss.
......@@ -14,4 +14,3 @@
# ==============================================================================
"""Activations package definition. Subject to change."""
from official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy import loss as weighted_sparse_categorical_crossentropy_loss
from official.nlp.modeling.losses.weighted_sparse_categorical_crossentropy import per_example_loss as weighted_sparse_categorical_crossentropy_per_example_loss
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Sparse categorical cross-entropy losses."""
"""Weighted sparse categorical cross-entropy losses."""
from __future__ import absolute_import
from __future__ import division
......@@ -43,37 +43,7 @@ def _validate_rank(labels, predictions, weights):
"predictions.shape was %s.") % (labels.shape, predictions.shape))
def per_example_loss(labels, predictions, weights=None):
"""Calculate a per-example sparse categorical crossentropy loss.
This loss function assumes that the predictions are post-softmax.
Args:
labels: The labels to evaluate against. Should be a set of integer indices
ranging from 0 to (vocab_size-1).
predictions: The network predictions. Should have softmax already applied.
weights: An optional weight array of the same shape as the 'labels' array.
If None, all examples will be used.
Returns:
A tensor of shape predictions.shape[:-1] containing the per-example
loss.
"""
# When using these functions with the Keras core API, we will need to squeeze
# the labels tensor - Keras adds a spurious inner dimension.
labels, predictions = _adjust_labels(labels, predictions)
_validate_rank(labels, predictions, weights)
labels_one_hot = tf.one_hot(labels, predictions.shape[-1])
labels_one_hot = tf.cast(labels_one_hot, predictions.dtype)
per_example_loss_data = -tf.reduce_sum(
predictions * labels_one_hot, axis=[-1])
if weights is not None:
weights = tf.cast(weights, per_example_loss_data.dtype)
per_example_loss_data = weights * per_example_loss_data
return per_example_loss_data
def loss(labels, predictions, weights=None):
def loss(labels, predictions, weights=None, from_logits=False):
"""Calculate a per-batch sparse categorical crossentropy loss.
This loss function assumes that the predictions are post-softmax.
......@@ -83,6 +53,7 @@ def loss(labels, predictions, weights=None):
predictions: The network predictions. Should have softmax already applied.
weights: An optional weight array of the same shape as the 'labels' array.
If None, all examples will be used.
from_logits: Whether the input predictions are logits.
Returns:
A loss scalar.
......@@ -95,12 +66,11 @@ def loss(labels, predictions, weights=None):
labels, predictions = _adjust_labels(labels, predictions)
_validate_rank(labels, predictions, weights)
per_example_loss_data = per_example_loss(labels, predictions, weights)
example_losses = tf.keras.losses.sparse_categorical_crossentropy(
labels, predictions, from_logits=from_logits)
if weights is None:
return tf.reduce_mean(per_example_loss_data)
else:
numerator = tf.reduce_sum(per_example_loss_data)
return tf.reduce_mean(example_losses)
weights = tf.cast(weights, predictions.dtype)
denominator = tf.reduce_sum(weights) + 1e-5
return numerator / denominator
return tf.math.divide_no_nan(
tf.reduce_sum(example_losses * weights), tf.reduce_sum(weights))
......@@ -53,8 +53,7 @@ class ClassificationLossTest(keras_parameterized.TestCase):
# Create a maskedLM from the transformer stack.
test_layer = layers.MaskedLM(
embedding_table=xformer_stack.get_embedding_table(),
output=output)
embedding_table=xformer_stack.get_embedding_table(), output=output)
# Create a model from the masked LM layer.
lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
......@@ -63,123 +62,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
output = test_layer(lm_input_tensor, masked_positions=masked_lm_positions)
return tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
def create_classification_model(self, input_width, num_classes):
test_object = networks.Classification(
input_width=input_width, num_classes=num_classes)
# Create a 2-dimensional input (the first dimension is implicit).
pooled_data = tf.keras.Input(shape=(input_width,), dtype=tf.float32)
output = test_object(pooled_data)
return tf.keras.Model(pooled_data, output)
def test_per_example_loss_3d_input(self):
"""Test per-example loss with a 3-dimensional input, from a masked LM."""
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
model = self.create_lm_model(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
# Get the output of the masked LM.
batch_size = 3
lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions))
output_data = model.predict([lm_input_data, masked_position_data])
# Calculate per-example loss.
labels = np.random.randint(vocab_size, size=(batch_size, num_predictions))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels)
# Per-example loss data should have one value per prediction, and those
# values shouldn't be zero in this case (as we're using random data).
expected_shape = [batch_size, num_predictions]
self.assertEqual(expected_shape, per_example_loss_data.shape.as_list())
self.assertNotAllClose(
tf.zeros_like(per_example_loss_data), per_example_loss_data)
def test_per_example_loss_2d_input(self):
"""Test per-example loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate per example loss.
labels = np.random.randint(num_classes, size=(batch_size))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels)
# Per-example loss data should have one value per batch item, and those
# values shouldn't be zero in this case (as we're using random data).
self.assertEqual([batch_size], per_example_loss_data.shape.as_list())
self.assertNotAllClose(
tf.zeros_like(per_example_loss_data), per_example_loss_data)
def test_per_example_loss_weights_3d_input(self):
"""Test weighted per-example loss with a 3-d input, from a masked LM."""
vocab_size = 100
sequence_length = 32
hidden_size = 64
num_predictions = 21
model = self.create_lm_model(
vocab_size=vocab_size,
sequence_length=sequence_length,
hidden_size=hidden_size,
num_predictions=num_predictions)
# Get the output of the masked LM.
batch_size = 3
lm_input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, hidden_size))
masked_position_data = np.random.randint(
2, size=(batch_size, num_predictions))
output_data = model.predict([lm_input_data, masked_position_data])
# Calculate per-example loss with weights.
labels = np.random.randint(vocab_size, size=(batch_size, num_predictions))
weights = np.random.randint(2, size=(batch_size, num_predictions))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
# Weighted per-example loss data should be equivalent to multiplying the
# loss tensor by the weights tensor.
expected_weighted_loss = per_example_loss_data * weights
self.assertAllClose(expected_weighted_loss, per_example_loss_data)
def test_per_example_loss_weights_2d_input(self):
"""Test weighted per-example loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate per-example loss with weights.
labels = np.random.randint(num_classes, size=(batch_size))
weights = np.random.randint(2, size=(batch_size))
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
# Weighted per-example loss data should be equivalent to multiplying the
# loss tensor by the weights tensor.
expected_weighted_loss = per_example_loss_data * weights
self.assertAllClose(expected_weighted_loss, per_example_loss_data)
def test_loss_3d_input(self):
"""Test overall loss with a 3-dimensional input, from a masked LM."""
vocab_size = 100
......@@ -213,26 +95,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
self.assertNotAllClose(
tf.zeros_like(per_example_loss_data), per_example_loss_data)
def test_loss_2d_input(self):
"""Test overall loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate per example loss.
labels = np.random.randint(num_classes, size=(batch_size))
loss_data = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels)
# Loss data should have one value only, and that value shouldn't be zero in
# this case (as we're using random data).
self.assertNotAllClose(0, loss_data)
def test_loss_weights_3d_input(self):
"""Test masked loss with a 3-dimensional input, from a masked LM."""
vocab_size = 100
......@@ -262,26 +124,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
# Because the tensor is fully masked, the loss should be 0.
self.assertAllClose(0, weighted_loss_data)
def test_loss_weights_2d_input(self):
"""Test masked loss with a 2-d input, from a classifier."""
input_width = 512
num_classes = 10
model = self.create_classification_model(input_width, num_classes)
# Invoke the network as part of a Model.
batch_size = 3
input_data = 10 * np.random.random_sample((batch_size, input_width))
output_data = model.predict(input_data)
# Calculate a fully masked weight tensor. This should give a loss of zero.
labels = np.random.randint(num_classes, size=(batch_size))
null_weights = np.zeros((batch_size))
weighted_loss_data = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=null_weights)
# Because the tensor is fully masked, the loss should be 0.
self.assertAllClose(0, weighted_loss_data)
def test_mismatched_predictions_and_labels_ranks_squeezes(self):
"""Test that the loss asserts when rank(predictions)-1 != rank(labels)."""
batch_size = 3
......@@ -289,7 +131,7 @@ class ClassificationLossTest(keras_parameterized.TestCase):
labels = np.random.randint(10, size=(batch_size, 1))
# All that this test tests is that the squeeze is successful.
_ = weighted_sparse_categorical_crossentropy.per_example_loss(
_ = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels)
def test_mismatched_weights_and_labels_ranks_fail(self):
......@@ -299,9 +141,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
labels = np.random.randint(10, size=(batch_size, 10))
weights = np.random.randint(2, size=(batch_size))
with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"):
_ = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"):
_ = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=weights)
......@@ -317,8 +156,6 @@ class ClassificationLossTest(keras_parameterized.TestCase):
# We're not trying to validate numerical correctness, just ensure that
# we can in fact pass tensors to these functions without causing runtime
# errors from the shape checking code.
_ = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels, weights=weights)
_ = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=weights)
......@@ -338,20 +175,15 @@ class ClassificationLossTest(keras_parameterized.TestCase):
[-2.7760355, -1.8219438, -3.0924666, -1.0779881, -0.9407509]]])
labels = np.array([[4, 0], [2, 2], [2, 1]])
# Validate that per_example loss calculations are the same.
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels)
expected_per_example_loss_data = [[1.2923571, 2.7117882],
[2.287932, 2.287932],
[3.0924666, 1.8219438]]
self.assertAllClose(expected_per_example_loss_data, per_example_loss_data)
# Validate that overall loss calculations are the same.
weights = np.array([[1, 0], [0, 0], [0, 0]])
loss_data = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=weights)
predictions=output_data,
labels=labels,
weights=weights,
from_logits=True)
expected_loss_data = 1.2923441
self.assertAllClose(expected_loss_data, loss_data)
self.assertAllClose(expected_loss_data, loss_data, rtol=1e-3)
def test_legacy_classification_loss_compatibility(self):
"""Test to validate computational correctness during refactors."""
......@@ -362,19 +194,15 @@ class ClassificationLossTest(keras_parameterized.TestCase):
[-1.6975292e-03, -6.4009643e+00, -1.0226612e+01]])
labels = np.array([2, 1])
# Validate that per_example loss calculations are the same.
per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss(
predictions=output_data, labels=labels)
expected_per_example_loss_data = [6.4434357, 6.4009643]
self.assertAllClose(expected_per_example_loss_data, per_example_loss_data)
# Validate that overall loss calculations are the same.
weights = None
loss_data = weighted_sparse_categorical_crossentropy.loss(
predictions=output_data, labels=labels, weights=weights)
predictions=output_data,
labels=labels,
weights=weights,
from_logits=True)
expected_loss_data = 6.4222
self.assertAllClose(expected_loss_data, loss_data)
self.assertAllClose(expected_loss_data, loss_data, rtol=1e-3)
if __name__ == "__main__":
tf.test.main()
......@@ -17,3 +17,4 @@ from official.nlp.modeling.models.bert_classifier import BertClassifier
from official.nlp.modeling.models.bert_pretrainer import BertPretrainer
from official.nlp.modeling.models.bert_span_labeler import BertSpanLabeler
from official.nlp.modeling.models.bert_token_classifier import BertTokenClassifier
from official.nlp.modeling.models.electra_pretrainer import ElectraPretrainer
......@@ -116,6 +116,22 @@ class ElectraPretrainer(tf.keras.Model):
units=1, kernel_initializer=mlm_initializer)
def call(self, inputs):
"""ELECTRA forward pass.
Args:
inputs: A dict of all inputs, same as the standard BERT model.
Returns:
outputs: A dict of pretrainer model outputs, including
(1) lm_outputs: a [batch_size, num_token_predictions, vocab_size] tensor
indicating logits on masked positions.
(2) sentence_outputs: a [batch_size, num_classes] tensor indicating
logits for nsp task.
(3) disc_logits: a [batch_size, sequence_length] tensor indicating
logits for discriminator replaced token detection task.
(4) disc_label: a [batch_size, sequence_length] tensor indicating
target labels for discriminator replaced token detection task.
"""
input_word_ids = inputs['input_word_ids']
input_mask = inputs['input_mask']
input_type_ids = inputs['input_type_ids']
......@@ -152,7 +168,14 @@ class ElectraPretrainer(tf.keras.Model):
disc_logits = self.discriminator_head(disc_sequence_output)
disc_logits = tf.squeeze(disc_logits, axis=-1)
return lm_outputs, sentence_outputs, disc_logits, disc_label
outputs = {
'lm_outputs': lm_outputs,
'sentence_outputs': sentence_outputs,
'disc_logits': disc_logits,
'disc_label': disc_label,
}
return outputs
def _get_fake_data(self, inputs, mlm_logits, duplicate=True):
"""Generate corrupted data for discriminator.
......
......@@ -69,7 +69,11 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
}
# Invoke the trainer model on the inputs. This causes the layer to be built.
lm_outs, cls_outs, disc_logits, disc_label = eletrca_trainer_model(inputs)
outputs = eletrca_trainer_model(inputs)
lm_outs = outputs['lm_outputs']
cls_outs = outputs['sentence_outputs']
disc_logits = outputs['disc_logits']
disc_label = outputs['disc_label']
# Validate that the outputs are of the expected shape.
expected_lm_shape = [None, num_token_predictions, vocab_size]
......@@ -117,7 +121,7 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
# Invoke the trainer model on the tensors. In Eager mode, this does the
# actual calculation. (We can't validate the outputs, since the network is
# too complex: this simply ensures we're not hitting runtime errors.)
_, _, _, _ = eletrca_trainer_model(inputs)
_ = eletrca_trainer_model(inputs)
def test_serialize_deserialize(self):
"""Validate that the ELECTRA trainer can be serialized and deserialized."""
......
......@@ -21,7 +21,6 @@ from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import bert
from official.nlp.data import pretrain_dataloader
from official.nlp.modeling import losses as loss_lib
@dataclasses.dataclass
......@@ -61,9 +60,10 @@ class MaskedLMTask(base_task.Task):
sentence_labels = labels['next_sentence_labels']
sentence_outputs = tf.cast(
model_outputs['next_sentence'], dtype=tf.float32)
sentence_loss = loss_lib.weighted_sparse_categorical_crossentropy_loss(
labels=sentence_labels,
predictions=tf.nn.log_softmax(sentence_outputs, axis=-1))
sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
sentence_labels,
sentence_outputs,
from_logits=True)
metrics['next_sentence_loss'].update_state(sentence_loss)
total_loss = mlm_loss + sentence_loss
else:
......
......@@ -14,7 +14,10 @@
# limitations under the License.
# ==============================================================================
"""Question answering task."""
import logging
import collections
import json
import os
from absl import logging
import dataclasses
import tensorflow as tf
import tensorflow_hub as hub
......@@ -22,7 +25,12 @@ import tensorflow_hub as hub
from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.bert import input_pipeline
from official.nlp.bert import squad_evaluate_v1_1
from official.nlp.bert import squad_evaluate_v2_0
from official.nlp.bert import tokenization
from official.nlp.configs import encoders
from official.nlp.data import squad_lib as squad_lib_wp
from official.nlp.data import squad_lib_sp
from official.nlp.modeling import models
from official.nlp.tasks import utils
......@@ -33,6 +41,9 @@ class QuestionAnsweringConfig(cfg.TaskConfig):
# At most one of `init_checkpoint` and `hub_module_url` can be specified.
init_checkpoint: str = ''
hub_module_url: str = ''
n_best_size: int = 20
max_answer_length: int = 30
null_score_diff_threshold: float = 0.0
model: encoders.TransformerEncoderConfig = (
encoders.TransformerEncoderConfig())
train_data: cfg.DataConfig = cfg.DataConfig()
......@@ -41,13 +52,10 @@ class QuestionAnsweringConfig(cfg.TaskConfig):
@base_task.register_task_cls(QuestionAnsweringConfig)
class QuestionAnsweringTask(base_task.Task):
"""Task object for question answering.
"""Task object for question answering."""
TODO(lehou): Add post-processing.
"""
def __init__(self, params=cfg.TaskConfig):
super(QuestionAnsweringTask, self).__init__(params)
def __init__(self, params=cfg.TaskConfig, logging_dir=None):
super(QuestionAnsweringTask, self).__init__(params, logging_dir)
if params.hub_module_url and params.init_checkpoint:
raise ValueError('At most one of `hub_module_url` and '
'`init_checkpoint` can be specified.')
......@@ -56,6 +64,18 @@ class QuestionAnsweringTask(base_task.Task):
else:
self._hub_module = None
if params.validation_data.tokenization == 'WordPiece':
self.squad_lib = squad_lib_wp
elif params.validation_data.tokenization == 'SentencePiece':
self.squad_lib = squad_lib_sp
else:
raise ValueError('Unsupported tokenization method: {}'.format(
params.validation_data.tokenization))
if params.validation_data.input_path:
self._tf_record_input_path, self._eval_examples, self._eval_features = (
self._preprocess_eval_data(params.validation_data))
def build_model(self):
if self._hub_module:
encoder_network = utils.get_encoder_from_hub(self._hub_module)
......@@ -85,9 +105,57 @@ class QuestionAnsweringTask(base_task.Task):
loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2
return loss
def _preprocess_eval_data(self, params):
eval_examples = self.squad_lib.read_squad_examples(
input_file=params.input_path,
is_training=False,
version_2_with_negative=params.version_2_with_negative)
temp_file_path = params.input_preprocessed_data_path or self.logging_dir
if not temp_file_path:
raise ValueError('You must specify a temporary directory, either in '
'params.input_preprocessed_data_path or logging_dir to '
'store intermediate evaluation TFRecord data.')
eval_writer = self.squad_lib.FeatureWriter(
filename=os.path.join(temp_file_path, 'eval.tf_record'),
is_training=False)
eval_features = []
def _append_feature(feature, is_padding):
if not is_padding:
eval_features.append(feature)
eval_writer.process_feature(feature)
kwargs = dict(
examples=eval_examples,
tokenizer=tokenization.FullTokenizer(
vocab_file=params.vocab_file,
do_lower_case=params.do_lower_case),
max_seq_length=params.seq_length,
doc_stride=params.doc_stride,
max_query_length=params.query_length,
is_training=False,
output_fn=_append_feature,
batch_size=params.global_batch_size)
if params.tokenization == 'SentencePiece':
# squad_lib_sp requires one more argument 'do_lower_case'.
kwargs['do_lower_case'] = params.do_lower_case
eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
eval_writer.close()
logging.info('***** Evaluation input stats *****')
logging.info(' Num orig examples = %d', len(eval_examples))
logging.info(' Num split examples = %d', len(eval_features))
logging.info(' Batch size = %d', params.global_batch_size)
logging.info(' Dataset size = %d', eval_dataset_size)
return eval_writer.filename, eval_examples, eval_features
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for sentence_prediction task."""
if params.input_path == 'dummy':
# Dummy training data for unit test.
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
x = dict(
......@@ -105,11 +173,16 @@ class QuestionAnsweringTask(base_task.Task):
dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset
if params.is_training:
input_path = params.input_path
else:
input_path = self._tf_record_input_path
batch_size = input_context.get_per_replica_batch_size(
params.global_batch_size) if input_context else params.global_batch_size
# TODO(chendouble): add and use nlp.data.question_answering_dataloader.
dataset = input_pipeline.create_squad_dataset(
params.input_path,
input_path,
params.seq_length,
batch_size,
is_training=params.is_training,
......@@ -141,6 +214,70 @@ class QuestionAnsweringTask(base_task.Task):
y_true=labels, # labels has keys 'start_positions' and 'end_positions'.
y_pred={'start_positions': start_logits, 'end_positions': end_logits})
def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
features, _ = inputs
unique_ids = features.pop('unique_ids')
model_outputs = self.inference_step(features, model)
start_logits, end_logits = model_outputs
logs = {
self.loss: 0.0, # TODO(lehou): compute the real validation loss.
'unique_ids': unique_ids,
'start_logits': start_logits,
'end_logits': end_logits,
}
return logs
raw_aggregated_result = collections.namedtuple(
'RawResult', ['unique_id', 'start_logits', 'end_logits'])
def aggregate_logs(self, state=None, step_outputs=None):
assert step_outputs is not None, 'Got no logs from self.validation_step.'
if state is None:
state = []
for unique_ids, start_logits, end_logits in zip(
step_outputs['unique_ids'],
step_outputs['start_logits'],
step_outputs['end_logits']):
u_ids, s_logits, e_logits = (
unique_ids.numpy(), start_logits.numpy(), end_logits.numpy())
if u_ids.size == 1:
u_ids = [u_ids]
s_logits = [s_logits]
e_logits = [e_logits]
for values in zip(u_ids, s_logits, e_logits):
state.append(self.raw_aggregated_result(
unique_id=values[0],
start_logits=values[1].tolist(),
end_logits=values[2].tolist()))
return state
def reduce_aggregated_logs(self, aggregated_logs):
all_predictions, _, scores_diff = (
self.squad_lib.postprocess_output(
self._eval_examples,
self._eval_features,
aggregated_logs,
self.task_config.n_best_size,
self.task_config.max_answer_length,
self.task_config.validation_data.do_lower_case,
version_2_with_negative=(
self.task_config.validation_data.version_2_with_negative),
null_score_diff_threshold=(
self.task_config.null_score_diff_threshold),
verbose=False))
with tf.io.gfile.GFile(
self.task_config.validation_data.input_path, 'r') as reader:
dataset_json = json.load(reader)
pred_dataset = dataset_json['data']
if self.task_config.validation_data.version_2_with_negative:
eval_metrics = squad_evaluate_v2_0.evaluate(
pred_dataset, all_predictions, scores_diff)
else:
eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
return eval_metrics
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
ckpt_dir_or_file = self.task_config.init_checkpoint
......@@ -150,7 +287,7 @@ class QuestionAnsweringTask(base_task.Task):
return
ckpt = tf.train.Checkpoint(**model.checkpoint_items)
status = ckpt.restore(ckpt_dir_or_file)
status = ckpt.read(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
......@@ -14,8 +14,10 @@
# limitations under the License.
# ==============================================================================
"""Tests for official.nlp.tasks.question_answering."""
import functools
import itertools
import json
import os
from absl.testing import parameterized
import tensorflow as tf
from official.nlp.bert import configs
......@@ -25,30 +27,67 @@ from official.nlp.configs import encoders
from official.nlp.tasks import question_answering
class QuestionAnsweringTaskTest(tf.test.TestCase):
class QuestionAnsweringTaskTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(QuestionAnsweringTaskTest, self).setUp()
self._encoder_config = encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1)
self._train_data_config = bert.QADataConfig(
input_path="dummy", seq_length=128, global_batch_size=1)
input_path="dummy",
seq_length=128,
global_batch_size=1)
val_data = {"version": "1.1",
"data": [{"paragraphs": [
{"context": "Sky is blue.",
"qas": [{"question": "What is blue?", "id": "1234",
"answers": [{"text": "Sky", "answer_start": 0},
{"text": "Sky", "answer_start": 0},
{"text": "Sky", "answer_start": 0}]
}]}]}]}
self._val_input_path = os.path.join(self.get_temp_dir(), "val_data.json")
with tf.io.gfile.GFile(self._val_input_path, "w") as writer:
writer.write(json.dumps(val_data, indent=4) + "\n")
self._test_vocab = os.path.join(self.get_temp_dir(), "vocab.txt")
with tf.io.gfile.GFile(self._test_vocab, "w") as writer:
writer.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\nsky\nis\nblue\n")
def _get_validation_data_config(self, version_2_with_negative=False):
return bert.QADevDataConfig(
input_path=self._val_input_path,
input_preprocessed_data_path=self.get_temp_dir(),
seq_length=128,
global_batch_size=1,
version_2_with_negative=version_2_with_negative,
vocab_file=self._test_vocab,
tokenization="WordPiece",
do_lower_case=True)
def _run_task(self, config):
task = question_answering.QuestionAnsweringTask(config)
model = task.build_model()
metrics = task.build_metrics()
task.initialize(model)
strategy = tf.distribute.get_strategy()
dataset = strategy.experimental_distribute_datasets_from_function(
functools.partial(task.build_inputs, config.train_data))
iterator = iter(dataset)
train_dataset = task.build_inputs(config.train_data)
train_iterator = iter(train_dataset)
optimizer = tf.keras.optimizers.SGD(lr=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
def test_task(self):
task.train_step(next(train_iterator), model, optimizer, metrics=metrics)
val_dataset = task.build_inputs(config.validation_data)
val_iterator = iter(val_dataset)
logs = task.validation_step(next(val_iterator), model, metrics=metrics)
logs = task.aggregate_logs(step_outputs=logs)
metrics = task.reduce_aggregated_logs(logs)
self.assertIn("final_f1", metrics)
@parameterized.parameters(itertools.product(
(False, True),
("WordPiece", "SentencePiece"),
))
def test_task(self, version_2_with_negative, tokenization):
# Saves a checkpoint.
pretrain_cfg = bert.BertPretrainerConfig(
encoder=self._encoder_config,
......@@ -65,22 +104,16 @@ class QuestionAnsweringTaskTest(tf.test.TestCase):
config = question_answering.QuestionAnsweringConfig(
init_checkpoint=saved_path,
model=self._encoder_config,
train_data=self._train_data_config)
task = question_answering.QuestionAnsweringTask(config)
model = task.build_model()
metrics = task.build_metrics()
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
optimizer = tf.keras.optimizers.SGD(lr=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
task.initialize(model)
train_data=self._train_data_config,
validation_data=self._get_validation_data_config(
version_2_with_negative))
self._run_task(config)
def test_task_with_fit(self):
config = question_answering.QuestionAnsweringConfig(
model=self._encoder_config,
train_data=self._train_data_config)
train_data=self._train_data_config,
validation_data=self._get_validation_data_config())
task = question_answering.QuestionAnsweringTask(config)
model = task.build_model()
model = task.compile_model(
......@@ -122,7 +155,8 @@ class QuestionAnsweringTaskTest(tf.test.TestCase):
config = question_answering.QuestionAnsweringConfig(
hub_module_url=hub_module_url,
model=self._encoder_config,
train_data=self._train_data_config)
train_data=self._train_data_config,
validation_data=self._get_validation_data_config())
self._run_task(config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment