Merge branch 'master' of https://github.com/tensorflow/models into context_tf2

3ce2f61b · Kaushik Shivakumar · bb16d5ca · 8e9296ff · 3ce2f61b · 3ce2f61b
Commit 3ce2f61b authored Jul 12, 2020 by Kaushik Shivakumar
20 changed files
--- a/official/nlp/configs/encoders.py
+++ b/official/nlp/configs/encoders.py
@@ -17,8 +17,8 @@

 Includes configurations and instantiation methods.
 """
-
 import dataclasses
+import gin
 import tensorflow as tf

 from official.modeling import tf_utils
@@ -42,10 +42,43 @@ class TransformerEncoderConfig(base_config.Config):
  initializer_range: float = 0.02


-def instantiate_encoder_from_cfg(
-    config: TransformerEncoderConfig) -> networks.TransformerEncoder:
+@gin.configurable
+def instantiate_encoder_from_cfg(config: TransformerEncoderConfig,
+                                 encoder_cls=networks.TransformerEncoder):
  """Instantiate a Transformer encoder network from TransformerEncoderConfig."""
-  encoder_network = networks.TransformerEncoder(
+  if encoder_cls.__name__ == "EncoderScaffold":
+    embedding_cfg = dict(
+        vocab_size=config.vocab_size,
+        type_vocab_size=config.type_vocab_size,
+        hidden_size=config.hidden_size,
+        seq_length=None,
+        max_seq_length=config.max_position_embeddings,
+        initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=config.initializer_range),
+        dropout_rate=config.dropout_rate,
+    )
+    hidden_cfg = dict(
+        num_attention_heads=config.num_attention_heads,
+        intermediate_size=config.intermediate_size,
+        intermediate_activation=tf_utils.get_activation(
+            config.hidden_activation),
+        dropout_rate=config.dropout_rate,
+        attention_dropout_rate=config.attention_dropout_rate,
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=config.initializer_range),
+    )
+    kwargs = dict(
+        embedding_cfg=embedding_cfg,
+        hidden_cfg=hidden_cfg,
+        num_hidden_instances=config.num_layers,
+        pooled_output_dim=config.hidden_size,
+        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=config.initializer_range))
+    return encoder_cls(**kwargs)
+
+  if encoder_cls.__name__ != "TransformerEncoder":
+    raise ValueError("Unknown encoder network class. %s" % str(encoder_cls))
+  encoder_network = encoder_cls(
      vocab_size=config.vocab_size,
      hidden_size=config.hidden_size,
      num_layers=config.num_layers,

--- a/official/nlp/data/classifier_data_lib.py
+++ b/official/nlp/data/classifier_data_lib.py
--- a/official/nlp/data/create_finetuning_data.py
+++ b/official/nlp/data/create_finetuning_data.py
@@ -50,8 +50,9 @@ flags.DEFINE_string(
    "for the task.")

 flags.DEFINE_enum("classification_task_name", "MNLI",
-                  ["COLA", "MNLI", "MRPC", "QNLI", "QQP", "SST-2", "XNLI",
-                   "PAWS-X", "XTREME-XNLI", "XTREME-PAWS-X"],
+                  ["COLA", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
+                   "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI",
+                   "XTREME-PAWS-X"],
                  "The name of the task to train BERT classifier. The "
                  "difference between XTREME-XNLI and XNLI is: 1. the format "
                  "of input tsv files; 2. the dev set for XTREME is english "
@@ -187,6 +188,8 @@ def generate_classifier_dataset():
        "rte": classifier_data_lib.RteProcessor,
        "sst-2":
            classifier_data_lib.SstProcessor,
+        "sts-b":
+            classifier_data_lib.StsBProcessor,
        "xnli":
            functools.partial(classifier_data_lib.XnliProcessor,
                              language=FLAGS.xnli_language),

--- a/official/nlp/data/data_loader_factory.py
+++ b/official/nlp/data/data_loader_factory.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A global factory to access NLP registered data loaders."""
+
+from official.utils import registry
+
+_REGISTERED_DATA_LOADER_CLS = {}
+
+
+def register_data_loader_cls(data_config_cls):
+  """Decorates a factory of DataLoader for lookup by a subclass of DataConfig.
+
+  This decorator supports registration of data loaders as follows:
+
+  ```
+  @dataclasses.dataclass
+  class MyDataConfig(DataConfig):
+    # Add fields here.
+    pass
+
+  @register_data_loader_cls(MyDataConfig)
+  class MyDataLoader:
+    # Inherits def __init__(self, data_config).
+    pass
+
+  my_data_config = MyDataConfig()
+
+  # Returns MyDataLoader(my_data_config).
+  my_loader = get_data_loader(my_data_config)
+  ```
+
+  Args:
+    data_config_cls: a subclass of DataConfig (*not* an instance
+      of DataConfig).
+
+  Returns:
+    A callable for use as class decorator that registers the decorated class
+      for creation from an instance of data_config_cls.
+  """
+  return registry.register(_REGISTERED_DATA_LOADER_CLS, data_config_cls)
+
+
+def get_data_loader(data_config):
+  """Creates a data_loader from data_config."""
+  return registry.lookup(_REGISTERED_DATA_LOADER_CLS, data_config.__class__)(
+      data_config)
--- a/official/nlp/data/pretrain_dataloader.py
+++ b/official/nlp/data/pretrain_dataloader.py
@@ -16,11 +16,27 @@
 """Loads dataset for the BERT pretraining task."""
 from typing import Mapping, Optional

+import dataclasses
 import tensorflow as tf

 from official.core import input_reader
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.data import data_loader_factory


+@dataclasses.dataclass
+class BertPretrainDataConfig(cfg.DataConfig):
+  """Data config for BERT pretraining task (tasks/masked_lm)."""
+  input_path: str = ''
+  global_batch_size: int = 512
+  is_training: bool = True
+  seq_length: int = 512
+  max_predictions_per_seq: int = 76
+  use_next_sentence_label: bool = True
+  use_position_id: bool = False
+
+
+@data_loader_factory.register_data_loader_cls(BertPretrainDataConfig)
 class BertPretrainDataLoader:
  """A class to load dataset for bert pretraining task."""

@@ -91,7 +107,5 @@ class BertPretrainDataLoader:
  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    reader = input_reader.InputReader(
-        params=self._params,
-        decoder_fn=self._decode,
-        parser_fn=self._parse)
+        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
    return reader.read(input_context)
--- a/official/nlp/data/question_answering_dataloader.py
+++ b/official/nlp/data/question_answering_dataloader.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Loads dataset for the question answering (e.g, SQuAD) task."""
+from typing import Mapping, Optional
+import dataclasses
+import tensorflow as tf
+
+from official.core import input_reader
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.data import data_loader_factory
+
+
+@dataclasses.dataclass
+class QADataConfig(cfg.DataConfig):
+  """Data config for question answering task (tasks/question_answering)."""
+  input_path: str = ''
+  global_batch_size: int = 48
+  is_training: bool = True
+  seq_length: int = 384
+  # Settings below are question answering specific.
+  version_2_with_negative: bool = False
+  # Settings below are only used for eval mode.
+  input_preprocessed_data_path: str = ''
+  doc_stride: int = 128
+  query_length: int = 64
+  vocab_file: str = ''
+  tokenization: str = 'WordPiece'  # WordPiece or SentencePiece
+  do_lower_case: bool = True
+
+
+@data_loader_factory.register_data_loader_cls(QADataConfig)
+class QuestionAnsweringDataLoader:
+  """A class to load dataset for sentence prediction (classification) task."""
+
+  def __init__(self, params):
+    self._params = params
+    self._seq_length = params.seq_length
+    self._is_training = params.is_training
+
+  def _decode(self, record: tf.Tensor):
+    """Decodes a serialized tf.Example."""
+    name_to_features = {
+        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
+        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
+        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
+    }
+    if self._is_training:
+      name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64)
+      name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64)
+    else:
+      name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64)
+    example = tf.io.parse_single_example(record, name_to_features)
+
+    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+    # So cast all int64 to int32.
+    for name in example:
+      t = example[name]
+      if t.dtype == tf.int64:
+        t = tf.cast(t, tf.int32)
+      example[name] = t
+
+    return example
+
+  def _parse(self, record: Mapping[str, tf.Tensor]):
+    """Parses raw tensors into a dict of tensors to be consumed by the model."""
+    x, y = {}, {}
+    for name, tensor in record.items():
+      if name in ('start_positions', 'end_positions'):
+        y[name] = tensor
+      elif name == 'input_ids':
+        x['input_word_ids'] = tensor
+      elif name == 'segment_ids':
+        x['input_type_ids'] = tensor
+      else:
+        x[name] = tensor
+    return (x, y)
+
+  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
+    """Returns a tf.dataset.Dataset."""
+    reader = input_reader.InputReader(
+        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
+    return reader.read(input_context)
--- a/official/nlp/data/sentence_prediction_dataloader.py
+++ b/official/nlp/data/sentence_prediction_dataloader.py
@@ -15,11 +15,24 @@
 # ==============================================================================
 """Loads dataset for the sentence prediction (classification) task."""
 from typing import Mapping, Optional
+import dataclasses
 import tensorflow as tf

 from official.core import input_reader
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.data import data_loader_factory


+@dataclasses.dataclass
+class SentencePredictionDataConfig(cfg.DataConfig):
+  """Data config for sentence prediction task (tasks/sentence_prediction)."""
+  input_path: str = ''
+  global_batch_size: int = 32
+  is_training: bool = True
+  seq_length: int = 128
+
+
+@data_loader_factory.register_data_loader_cls(SentencePredictionDataConfig)
 class SentencePredictionDataLoader:
  """A class to load dataset for sentence prediction (classification) task."""


--- a/official/nlp/data/tagging_data_loader.py
+++ b/official/nlp/data/tagging_data_loader.py
@@ -15,17 +15,30 @@
 # ==============================================================================
 """Loads dataset for the tagging (e.g., NER/POS) task."""
 from typing import Mapping, Optional
+import dataclasses
 import tensorflow as tf

 from official.core import input_reader
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.data import data_loader_factory


+@dataclasses.dataclass
+class TaggingDataConfig(cfg.DataConfig):
+  """Data config for tagging (tasks/tagging)."""
+  is_training: bool = True
+  seq_length: int = 128
+  include_sentence_id: bool = False
+
+
+@data_loader_factory.register_data_loader_cls(TaggingDataConfig)
 class TaggingDataLoader:
  """A class to load dataset for tagging (e.g., NER and POS) task."""

-  def __init__(self, params):
+  def __init__(self, params: TaggingDataConfig):
    self._params = params
    self._seq_length = params.seq_length
+    self._include_sentence_id = params.include_sentence_id

  def _decode(self, record: tf.Tensor):
    """Decodes a serialized tf.Example."""
@@ -35,6 +48,9 @@ class TaggingDataLoader:
        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
        'label_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
+    if self._include_sentence_id:
+      name_to_features['sentence_id'] = tf.io.FixedLenFeature([], tf.int64)
+
    example = tf.io.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
@@ -54,6 +70,8 @@ class TaggingDataLoader:
        'input_mask': record['input_mask'],
        'input_type_ids': record['segment_ids']
    }
+    if self._include_sentence_id:
+      x['sentence_id'] = record['sentence_id']
    y = record['label_ids']
    return (x, y)


--- a/official/nlp/modeling/README.md
+++ b/official/nlp/modeling/README.md
 # NLP Modeling Library

-This libary provides a set of Keras primitives (Layers, Networks, and Models)
+This library provides a set of Keras primitives (Layers, Networks, and Models)
 that can be assembled into transformer-based models. They are
 flexible, validated, interoperable, and both TF1 and TF2 compatible.

@@ -16,6 +16,11 @@ standardized configuration.

 * [`losses`](losses) contains common loss computation used in NLP tasks.

+Please see the colab
+[nlp_modeling_library_intro.ipynb]
+(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb)
+for how to build transformer-based NLP models using above primitives.
+
 Besides the pre-defined primitives, it also provides scaffold classes to allow
 easy experimentation with noval achitectures, e.g., you don’t need to fork a whole Transformer object to try a different kind of attention primitive, for instance.

@@ -33,11 +38,9 @@ embedding subnetwork (which will replace the standard embedding logic) and/or a
 custom hidden layer (which will replace the Transformer instantiation in the
 encoder).

-BERT and ALBERT models in this repo are implemented using this library. Code examples can be found in the corresponding model folder.
-
-
-
-
-
-
+Please see the colab
+[customize_encoder.ipynb]
+(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb)
+for how to use scaffold classes to build noval achitectures.

+BERT and ALBERT models in this repo are implemented using this library. Code examples can be found in the corresponding model folder.
--- a/official/nlp/modeling/layers/README.md
+++ b/official/nlp/modeling/layers/README.md
@@ -3,11 +3,6 @@
 Layers are the fundamental building blocks for NLP models. They can be used to
 assemble new layers, networks, or models.

-*   [DenseEinsum](dense_einsum.py) implements a feedforward network using
-    tf.einsum. This layer contains the einsum op, the associated weight, and the
-    logic required to generate the einsum expression for the given
-    initialization parameters.
-
 *   [MultiHeadAttention](attention.py) implements an optionally masked attention
    between query, key, value tensors as described in
    ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If

--- a/official/nlp/modeling/layers/dense_einsum.py
+++ b/official/nlp/modeling/layers/dense_einsum.py
@@ -21,6 +21,8 @@ from __future__ import print_function

 import tensorflow as tf

+from tensorflow.python.util import deprecation
+
 _CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"]


@@ -57,6 +59,9 @@ class DenseEinsum(tf.keras.layers.Layer):
      `(batch_size, units)`.
  """

+  @deprecation.deprecated(
+      None, "DenseEinsum is deprecated. Please use "
+      "tf.keras.experimental.EinsumDense layer instead.")
  def __init__(self,
               output_shape,
               num_summed_dimensions=1,

--- a/official/nlp/modeling/layers/multi_channel_attention.py
+++ b/official/nlp/modeling/layers/multi_channel_attention.py
@@ -26,7 +26,6 @@ import math
 import tensorflow as tf
 from official.modeling import tf_utils
 from official.nlp.modeling.layers import attention
-from official.nlp.modeling.layers import dense_einsum
 from official.nlp.modeling.layers import masked_softmax


@@ -67,28 +66,26 @@ class VotingAttention(tf.keras.layers.Layer):
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)

  def build(self, unused_input_shapes):
-    self._query_dense = dense_einsum.DenseEinsum(
-        output_shape=(self._num_heads, self._head_size),
+    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        dtype=self.dtype,
-        name="encdocatt_query")
-    self._key_dense = dense_einsum.DenseEinsum(
-        output_shape=(self._num_heads, self._head_size),
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        dtype=self.dtype,
-        name="encdocatt_key")
+        bias_constraint=self._bias_constraint)
+    self._query_dense = tf.keras.layers.experimental.EinsumDense(
+        "BAE,ENH->BANH",
+        output_shape=(None, self._num_heads, self._head_size),
+        bias_axes="NH",
+        name="query",
+        **common_kwargs)
+    self._key_dense = tf.keras.layers.experimental.EinsumDense(
+        "BAE,ENH->BANH",
+        output_shape=(None, self._num_heads, self._head_size),
+        bias_axes="NH",
+        name="key",
+        **common_kwargs)
    super(VotingAttention, self).build(unused_input_shapes)

  def call(self, encoder_outputs, doc_attention_mask):

--- a/official/nlp/modeling/layers/rezero_transformer.py
+++ b/official/nlp/modeling/layers/rezero_transformer.py
@@ -23,7 +23,6 @@ import gin
 import tensorflow as tf

 from official.nlp.modeling.layers import attention
-from official.nlp.modeling.layers import dense_einsum


 @tf.keras.utils.register_keras_serializable(package="Text")
@@ -109,19 +108,20 @@ class ReZeroTransformer(tf.keras.layers.Layer):
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
-
-    self._attention_layer = attention.MultiHeadAttention(
-        num_heads=self._num_heads,
-        key_size=self._attention_head_size,
-        dropout=self._attention_dropout_rate,
+    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="self_attention")
+        bias_constraint=self._bias_constraint)
+    self._attention_layer = attention.MultiHeadAttention(
+        num_heads=self._num_heads,
+        key_size=self._attention_head_size,
+        dropout=self._attention_dropout_rate,
+        name="self_attention",
+        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    if self._use_layer_norm:
      # Use float32 in layernorm for numeric stability.
@@ -132,17 +132,12 @@ class ReZeroTransformer(tf.keras.layers.Layer):
              axis=-1,
              epsilon=1e-12,
              dtype=tf.float32))
-    self._intermediate_dense = dense_einsum.DenseEinsum(
-        output_shape=self._intermediate_size,
-        activation=None,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="intermediate")
+    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
+        "abc,cd->abd",
+        output_shape=(None, self._intermediate_size),
+        bias_axes="d",
+        name="intermediate",
+        **common_kwargs)
    policy = tf.keras.mixed_precision.experimental.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
@@ -151,16 +146,12 @@ class ReZeroTransformer(tf.keras.layers.Layer):
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._intermediate_activation, dtype=policy)
-    self._output_dense = dense_einsum.DenseEinsum(
-        output_shape=hidden_size,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="output")
+    self._output_dense = tf.keras.layers.experimental.EinsumDense(
+        "abc,cd->abd",
+        output_shape=(None, hidden_size),
+        bias_axes="d",
+        name="output",
+        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    if self._use_layer_norm:
      # Use float32 in layernorm for numeric stability.

--- a/official/nlp/modeling/layers/transformer.py
+++ b/official/nlp/modeling/layers/transformer.py
@@ -23,7 +23,6 @@ import gin
 import tensorflow as tf

 from official.nlp.modeling.layers import attention
-from official.nlp.modeling.layers import dense_einsum
 from official.nlp.modeling.layers import multi_channel_attention
 from official.nlp.modeling.layers.util import tf_function_if_eager

@@ -106,19 +105,20 @@ class Transformer(tf.keras.layers.Layer):
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
-
-    self._attention_layer = attention.MultiHeadAttention(
-        num_heads=self._num_heads,
-        key_size=self._attention_head_size,
-        dropout=self._attention_dropout_rate,
+    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="self_attention")
+        bias_constraint=self._bias_constraint)
+    self._attention_layer = attention.MultiHeadAttention(
+        num_heads=self._num_heads,
+        key_size=self._attention_head_size,
+        dropout=self._attention_dropout_rate,
+        name="self_attention",
+        **common_kwargs)
    # pylint: disable=protected-access
    self._attention_layer.build([input_tensor_shape] * 3)
    self._attention_output_dense = self._attention_layer._output_dense
@@ -132,17 +132,12 @@ class Transformer(tf.keras.layers.Layer):
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32))
-    self._intermediate_dense = dense_einsum.DenseEinsum(
-        output_shape=self._intermediate_size,
-        activation=None,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="intermediate")
+    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
+        "abc,cd->abd",
+        output_shape=(None, self._intermediate_size),
+        bias_axes="d",
+        name="intermediate",
+        **common_kwargs)
    policy = tf.keras.mixed_precision.experimental.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
@@ -151,16 +146,12 @@ class Transformer(tf.keras.layers.Layer):
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._intermediate_activation, dtype=policy)
-    self._output_dense = dense_einsum.DenseEinsum(
-        output_shape=hidden_size,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="output")
+    self._output_dense = tf.keras.layers.experimental.EinsumDense(
+        "abc,cd->abd",
+        output_shape=(None, hidden_size),
+        bias_axes="d",
+        name="output",
+        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
@@ -312,30 +303,27 @@ class TransformerDecoderLayer(tf.keras.layers.Layer):
          "The hidden size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self.num_attention_heads))
    self.attention_head_size = int(hidden_size / self.num_attention_heads)
-    # Self attention.
-    self.self_attention = attention.CachedAttention(
-        num_heads=self.num_attention_heads,
-        key_size=self.attention_head_size,
-        dropout=self.attention_dropout_rate,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="self_attention")
-    self.self_attention_output_dense = dense_einsum.DenseEinsum(
-        output_shape=hidden_size,
-        num_summed_dimensions=2,
+    common_kwargs = dict(
        kernel_initializer=self._kernel_initializer,
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="self_attention_output")
+        bias_constraint=self._bias_constraint)
+    # Self attention.
+    self.self_attention = attention.CachedAttention(
+        num_heads=self.num_attention_heads,
+        key_size=self.attention_head_size,
+        dropout=self.attention_dropout_rate,
+        name="self_attention",
+        **common_kwargs)
+    self.self_attention_output_dense = tf.keras.layers.experimental.EinsumDense(
+        "abc,cd->abd",
+        output_shape=(None, hidden_size),
+        bias_axes="d",
+        name="output",
+        **common_kwargs)
    self.self_attention_dropout = tf.keras.layers.Dropout(
        rate=self.dropout_rate)
    self.self_attention_layer_norm = (
@@ -347,14 +335,8 @@ class TransformerDecoderLayer(tf.keras.layers.Layer):
        key_size=self.attention_head_size,
        dropout=self.attention_dropout_rate,
        output_shape=hidden_size,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="attention/encdec")
+        name="attention/encdec",
+        **common_kwargs)

    self.encdec_attention_dropout = tf.keras.layers.Dropout(
        rate=self.dropout_rate)
@@ -363,29 +345,20 @@ class TransformerDecoderLayer(tf.keras.layers.Layer):
            name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))

    # Feed-forward projection.
-    self.intermediate_dense = dense_einsum.DenseEinsum(
-        output_shape=self.intermediate_size,
-        activation=None,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="intermediate")
+    self.intermediate_dense = tf.keras.layers.experimental.EinsumDense(
+        "abc,cd->abd",
+        output_shape=(None, self.intermediate_size),
+        bias_axes="d",
+        name="intermediate",
+        **common_kwargs)
    self.intermediate_activation_layer = tf.keras.layers.Activation(
        self.intermediate_activation)
-    self.output_dense = dense_einsum.DenseEinsum(
-        output_shape=hidden_size,
-        kernel_initializer=self._kernel_initializer,
-        bias_initializer=self._bias_initializer,
-        kernel_regularizer=self._kernel_regularizer,
-        bias_regularizer=self._bias_regularizer,
-        activity_regularizer=self._activity_regularizer,
-        kernel_constraint=self._kernel_constraint,
-        bias_constraint=self._bias_constraint,
-        name="output")
+    self.output_dense = tf.keras.layers.experimental.EinsumDense(
+        "abc,cd->abd",
+        output_shape=(None, hidden_size),
+        bias_axes="d",
+        name="output",
+        **common_kwargs)
    self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
    self.output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm", axis=-1, epsilon=1e-12)

--- a/official/nlp/modeling/models/bert_classifier.py
+++ b/official/nlp/modeling/models/bert_classifier.py
@@ -21,6 +21,7 @@ from __future__ import print_function

 import tensorflow as tf

+from official.nlp.modeling import layers
 from official.nlp.modeling import networks


@@ -43,23 +44,25 @@ class BertClassifier(tf.keras.Model):
    num_classes: Number of classes to predict from the classification network.
    initializer: The initializer (if any) to use in the classification networks.
      Defaults to a Glorot uniform initializer.
-    output: The output style for this network. Can be either 'logits' or
-      'predictions'.
+    dropout_rate: The dropout probability of the cls head.
+    use_encoder_pooler: Whether to use the pooler layer pre-defined inside
+      the encoder.
  """

  def __init__(self,
               network,
               num_classes,
               initializer='glorot_uniform',
-               output='logits',
               dropout_rate=0.1,
+               use_encoder_pooler=True,
               **kwargs):
    self._self_setattr_tracking = False
+    self._network = network
    self._config = {
        'network': network,
        'num_classes': num_classes,
        'initializer': initializer,
-        'output': output,
+        'use_encoder_pooler': use_encoder_pooler,
    }

    # We want to use the inputs of the passed network as the inputs to this
@@ -67,22 +70,36 @@ class BertClassifier(tf.keras.Model):
    # when we construct the Model object at the end of init.
    inputs = network.inputs

-    # Because we have a copy of inputs to create this Model object, we can
-    # invoke the Network object with its own input tensors to start the Model.
-    _, cls_output = network(inputs)
-    cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)
+    if use_encoder_pooler:
+      # Because we have a copy of inputs to create this Model object, we can
+      # invoke the Network object with its own input tensors to start the Model.
+      _, cls_output = network(inputs)
+      cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)

-    self.classifier = networks.Classification(
-        input_width=cls_output.shape[-1],
-        num_classes=num_classes,
-        initializer=initializer,
-        output=output,
-        name='classification')
-    predictions = self.classifier(cls_output)
+      self.classifier = networks.Classification(
+          input_width=cls_output.shape[-1],
+          num_classes=num_classes,
+          initializer=initializer,
+          output='logits',
+          name='sentence_prediction')
+      predictions = self.classifier(cls_output)
+    else:
+      sequence_output, _ = network(inputs)
+      self.classifier = layers.ClassificationHead(
+          inner_dim=sequence_output.shape[-1],
+          num_classes=num_classes,
+          initializer=initializer,
+          dropout_rate=dropout_rate,
+          name='sentence_prediction')
+      predictions = self.classifier(sequence_output)

    super(BertClassifier, self).__init__(
        inputs=inputs, outputs=predictions, **kwargs)

+  @property
+  def checkpoint_items(self):
+    return dict(encoder=self._network)
+
  def get_config(self):
    return self._config


--- a/official/nlp/modeling/models/bert_classifier_test.py
+++ b/official/nlp/modeling/models/bert_classifier_test.py
@@ -42,8 +42,7 @@ class BertClassifierTest(keras_parameterized.TestCase):

    # Create a BERT trainer with the created network.
    bert_trainer_model = bert_classifier.BertClassifier(
-        test_network,
-        num_classes=num_classes)
+        test_network, num_classes=num_classes)

    # Create a set of 2-dimensional inputs (the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -89,7 +88,7 @@ class BertClassifierTest(keras_parameterized.TestCase):
    # Create a BERT trainer with the created network. (Note that all the args
    # are different, so we can catch any serialization mismatches.)
    bert_trainer_model = bert_classifier.BertClassifier(
-        test_network, num_classes=4, initializer='zeros', output='predictions')
+        test_network, num_classes=4, initializer='zeros')

    # Create another BERT trainer via serialization and deserialization.
    config = bert_trainer_model.get_config()

--- a/official/nlp/modeling/models/bert_pretrainer.py
+++ b/official/nlp/modeling/models/bert_pretrainer.py
@@ -147,11 +147,9 @@ class BertPretrainerV2(tf.keras.Model):

  (Experimental).
  Adds the masked language model head and optional classification heads upon the
-  transformer encoder. When num_masked_tokens == 0, there won't be MaskedLM
-  head.
+  transformer encoder.

  Arguments:
-    num_masked_tokens: Number of tokens to predict from the masked LM.
    encoder_network: A transformer network. This network should output a
      sequence output and a classification output.
    mlm_activation: The activation (if any) to use in the masked LM network. If
@@ -169,7 +167,6 @@ class BertPretrainerV2(tf.keras.Model):

  def __init__(
      self,
-      num_masked_tokens: int,
      encoder_network: tf.keras.Model,
      mlm_activation=None,
      mlm_initializer='glorot_uniform',
@@ -179,7 +176,6 @@ class BertPretrainerV2(tf.keras.Model):
    self._self_setattr_tracking = False
    self._config = {
        'encoder_network': encoder_network,
-        'num_masked_tokens': num_masked_tokens,
        'mlm_initializer': mlm_initializer,
        'classification_heads': classification_heads,
        'name': name,
@@ -195,19 +191,16 @@ class BertPretrainerV2(tf.keras.Model):
      raise ValueError('Classification heads should have unique names.')

    outputs = dict()
-    if num_masked_tokens > 0:
-      self.masked_lm = layers.MaskedLM(
-          embedding_table=self.encoder_network.get_embedding_table(),
-          activation=mlm_activation,
-          initializer=mlm_initializer,
-          name='cls/predictions')
-      masked_lm_positions = tf.keras.layers.Input(
-          shape=(num_masked_tokens,),
-          name='masked_lm_positions',
-          dtype=tf.int32)
-      inputs.append(masked_lm_positions)
-      outputs['lm_output'] = self.masked_lm(
-          sequence_output, masked_positions=masked_lm_positions)
+    self.masked_lm = layers.MaskedLM(
+        embedding_table=self.encoder_network.get_embedding_table(),
+        activation=mlm_activation,
+        initializer=mlm_initializer,
+        name='cls/predictions')
+    masked_lm_positions = tf.keras.layers.Input(
+        shape=(None,), name='masked_lm_positions', dtype=tf.int32)
+    inputs.append(masked_lm_positions)
+    outputs['lm_output'] = self.masked_lm(
+        sequence_output, masked_positions=masked_lm_positions)
    for cls_head in self.classification_heads:
      outputs[cls_head.name] = cls_head(sequence_output)

@@ -217,7 +210,7 @@ class BertPretrainerV2(tf.keras.Model):
  @property
  def checkpoint_items(self):
    """Returns a dictionary of items to be additionally checkpointed."""
-    items = dict(encoder=self.encoder_network)
+    items = dict(encoder=self.encoder_network, masked_lm=self.masked_lm)
    for head in self.classification_heads:
      for key, item in head.checkpoint_items.items():
        items['.'.join([head.name, key])] = item

--- a/official/nlp/modeling/models/bert_pretrainer_test.py
+++ b/official/nlp/modeling/models/bert_pretrainer_test.py
@@ -118,10 +118,9 @@ class BertPretrainerTest(keras_parameterized.TestCase):
        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)

    # Create a BERT trainer with the created network.
-    num_token_predictions = 2
    bert_trainer_model = bert_pretrainer.BertPretrainerV2(
-        encoder_network=test_network, num_masked_tokens=num_token_predictions)
-
+        encoder_network=test_network)
+    num_token_predictions = 20
    # Create a set of 2-dimensional inputs (the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -145,7 +144,7 @@ class BertPretrainerTest(keras_parameterized.TestCase):
    # Create a BERT trainer with the created network. (Note that all the args
    # are different, so we can catch any serialization mismatches.)
    bert_trainer_model = bert_pretrainer.BertPretrainerV2(
-        encoder_network=test_network, num_masked_tokens=2)
+        encoder_network=test_network)

    # Create another BERT trainer via serialization and deserialization.
    config = bert_trainer_model.get_config()

--- a/official/nlp/tasks/masked_lm.py
+++ b/official/nlp/tasks/masked_lm.py
@@ -14,18 +14,20 @@
 # limitations under the License.
 # ==============================================================================
 """Masked language task."""
+from absl import logging
 import dataclasses
 import tensorflow as tf

 from official.core import base_task
 from official.modeling.hyperparams import config_definitions as cfg
 from official.nlp.configs import bert
-from official.nlp.data import pretrain_dataloader
+from official.nlp.data import data_loader_factory


 @dataclasses.dataclass
 class MaskedLMConfig(cfg.TaskConfig):
  """The model config."""
+  init_checkpoint: str = ''
  model: bert.BertPretrainerConfig = bert.BertPretrainerConfig(cls_heads=[
      bert.ClsHeadConfig(
          inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence')
@@ -39,7 +41,7 @@ class MaskedLMTask(base_task.Task):
  """Mock task object for testing."""

  def build_model(self):
-    return bert.instantiate_bertpretrainer_from_cfg(self.task_config.model)
+    return bert.instantiate_pretrainer_from_cfg(self.task_config.model)

  def build_losses(self,
                   labels,
@@ -60,10 +62,10 @@ class MaskedLMTask(base_task.Task):
      sentence_labels = labels['next_sentence_labels']
      sentence_outputs = tf.cast(
          model_outputs['next_sentence'], dtype=tf.float32)
-      sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
-          sentence_labels,
-          sentence_outputs,
-          from_logits=True)
+      sentence_loss = tf.reduce_mean(
+          tf.keras.losses.sparse_categorical_crossentropy(sentence_labels,
+                                                          sentence_outputs,
+                                                          from_logits=True))
      metrics['next_sentence_loss'].update_state(sentence_loss)
      total_loss = mlm_loss + sentence_loss
    else:
@@ -95,8 +97,7 @@ class MaskedLMTask(base_task.Task):
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

-    return pretrain_dataloader.BertPretrainDataLoader(params).load(
-        input_context)
+    return data_loader_factory.get_data_loader(params).load(input_context)

  def build_metrics(self, training=None):
    del training
@@ -172,3 +173,17 @@ class MaskedLMTask(base_task.Task):
        aux_losses=model.losses)
    self.process_metrics(metrics, inputs, outputs)
    return {self.loss: loss}
+
+  def initialize(self, model: tf.keras.Model):
+    ckpt_dir_or_file = self.task_config.init_checkpoint
+    if tf.io.gfile.isdir(ckpt_dir_or_file):
+      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
+    if not ckpt_dir_or_file:
+      return
+    # Restoring all modules defined by the model, e.g. encoder, masked_lm and
+    # cls pooler. The best initialization may vary case by case.
+    ckpt = tf.train.Checkpoint(**model.checkpoint_items)
+    status = ckpt.read(ckpt_dir_or_file)
+    status.expect_partial().assert_existing_objects_matched()
+    logging.info('Finished loading pretrained checkpoint from %s',
+                 ckpt_dir_or_file)
--- a/official/nlp/tasks/masked_lm_test.py
+++ b/official/nlp/tasks/masked_lm_test.py
@@ -19,6 +19,7 @@ import tensorflow as tf

 from official.nlp.configs import bert
 from official.nlp.configs import encoders
+from official.nlp.data import pretrain_dataloader
 from official.nlp.tasks import masked_lm


@@ -26,14 +27,14 @@ class MLMTaskTest(tf.test.TestCase):

  def test_task(self):
    config = masked_lm.MaskedLMConfig(
+        init_checkpoint=self.get_temp_dir(),
        model=bert.BertPretrainerConfig(
            encoders.TransformerEncoderConfig(vocab_size=30522, num_layers=1),
-            num_masked_tokens=20,
            cls_heads=[
                bert.ClsHeadConfig(
                    inner_dim=10, num_classes=2, name="next_sentence")
            ]),
-        train_data=bert.BertPretrainDataConfig(
+        train_data=pretrain_dataloader.BertPretrainDataConfig(
            input_path="dummy",
            max_predictions_per_seq=20,
            seq_length=128,
@@ -48,6 +49,12 @@ class MLMTaskTest(tf.test.TestCase):
    task.train_step(next(iterator), model, optimizer, metrics=metrics)
    task.validation_step(next(iterator), model, metrics=metrics)

+    # Saves a checkpoint.
+    ckpt = tf.train.Checkpoint(
+        model=model, **model.checkpoint_items)
+    ckpt.save(config.init_checkpoint)
+    task.initialize(model)
+

 if __name__ == "__main__":
  tf.test.main()