for how to build transformer-based NLP models using the above primitives.
Besides the pre-defined primitives, it also provides scaffold classes to allow
easy experimentation with novel architectures, e.g., you don't need to fork a
whole Transformer object to try a different kind of attention primitive.
...
embedding subnetwork (which will replace the standard embedding logic) and/or a
custom hidden layer (which will replace the Transformer instantiation in the
encoder).
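
As a concrete illustration, the sketch below shows how a custom hidden layer
might be dropped into the encoder via the scaffold's constructor. It is a
minimal sketch, assuming the `EncoderScaffold` class from this library's
`networks` module and its documented `hidden_cls`/`hidden_cfg`/`embedding_cfg`
arguments; the exact argument names and config keys should be checked against
the installed version, and `MyHiddenBlock` is a hypothetical stand-in for a
user-defined layer.

```python
import tensorflow as tf
from official.nlp.modeling import networks


class MyHiddenBlock(tf.keras.layers.Layer):
  """Hypothetical replacement for the standard Transformer block.

  Assumption: the scaffold calls each hidden layer with
  [input_tensor, attention_mask] and expects a tensor of the same
  hidden size back, so any Keras layer with that contract can be used.
  """

  def __init__(self, hidden_size=768, **kwargs):
    super().__init__(**kwargs)
    self.dense = tf.keras.layers.Dense(hidden_size, activation='relu')

  def call(self, inputs):
    input_tensor, attention_mask = inputs  # mask unused in this toy block
    return self.dense(input_tensor)


# Constructor arguments below follow the EncoderScaffold documentation;
# verify them against your installed tf-models version.
encoder = networks.EncoderScaffold(
    pooled_output_dim=768,
    num_hidden_instances=12,          # stack 12 copies of the custom block
    hidden_cls=MyHiddenBlock,         # replaces the Transformer instantiation
    hidden_cfg={'hidden_size': 768},  # kwargs forwarded to MyHiddenBlock
    embedding_cfg={                   # keeps the standard embedding subnetwork
        'vocab_size': 30522,
        'type_vocab_size': 2,
        'hidden_size': 768,
        'seq_length': 128,
        'max_seq_length': 128,
        'initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02),
        'dropout_rate': 0.1,
    },
)
```

A custom embedding subnetwork can presumably be substituted the same way, by
passing a user-defined Keras layer or network through the scaffold's
`embedding_cls` argument in place of the config dict above.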
BERT and ALBERT models in this repo are implemented using this library. Code examples can be found in the corresponding model folder.