ModelZoo / ResNet50_tensorflow

Commit 96095246, authored Dec 19, 2019 by A. Unique TensorFlower

Add an encoder scaffold.

PiperOrigin-RevId: 286477560
parent 745e53a9
Showing 6 changed files with 961 additions and 12 deletions:
official/nlp/modeling/layers/__init__.py (+1, -0)
official/nlp/modeling/layers/self_attention_mask.py (+63, -0)
official/nlp/modeling/networks/albert_transformer_encoder.py (+1, -2)
official/nlp/modeling/networks/encoder_scaffold.py (+249, -0)
official/nlp/modeling/networks/encoder_scaffold_test.py (+646, -0)
official/nlp/modeling/networks/transformer_encoder.py (+1, -10)
official/nlp/modeling/layers/__init__.py

@@ -18,4 +18,5 @@ from official.nlp.modeling.layers.dense_einsum import DenseEinsum
 from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
 from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.modeling.layers.position_embedding import PositionEmbedding
+from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
 from official.nlp.modeling.layers.transformer import Transformer
official/nlp/modeling/layers/self_attention_mask.py (new file, mode 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras layer that creates a self-attention mask."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

import tensorflow as tf

from official.modeling import tf_utils


@tf.keras.utils.register_keras_serializable(package='Text')
class SelfAttentionMask(tf.keras.layers.Layer):
  """Create 3D attention mask from a 2D tensor mask.

    inputs[0]: from_tensor: 2D or 3D Tensor of shape
      [batch_size, from_seq_length, ...].
    inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """

  def call(self, inputs):
    from_tensor = inputs[0]
    to_mask = inputs[1]
    from_shape = tf_utils.get_shape_list(from_tensor, expected_rank=[2, 3])
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]

    to_shape = tf_utils.get_shape_list(to_mask, expected_rank=2)
    to_seq_length = to_shape[1]

    to_mask = tf.cast(
        tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
        dtype=from_tensor.dtype)

    # We don't assume that `from_tensor` is a mask (although it could be). We
    # don't actually care if we attend *from* padding tokens (only *to*
    # padding tokens) so we create a tensor of all ones.
    #
    # `broadcast_ones` = [batch_size, from_seq_length, 1]
    broadcast_ones = tf.ones(
        shape=[batch_size, from_seq_length, 1], dtype=from_tensor.dtype)

    # Here we broadcast along two dimensions to create the mask.
    mask = broadcast_ones * to_mask

    return mask
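
As a quick illustration (not part of the commit), the new layer turns a 2D padding mask into the 3D mask the transformer layers consume. A minimal sketch with toy values, assuming the export added to __init__.py above:

import tensorflow as tf
from official.nlp.modeling import layers

embeddings = tf.random.uniform([2, 4, 8])            # [batch, from_seq, width]
padding_mask = tf.constant([[1, 1, 1, 0],
                            [1, 1, 0, 0]], tf.int32)  # [batch, to_seq]

attention_mask = layers.SelfAttentionMask()([embeddings, padding_mask])
print(attention_mask.shape)  # (2, 4, 4): each row masks attention *to* padding.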
official/nlp/modeling/networks/albert_transformer_encoder.py

@@ -24,7 +24,6 @@ import tensorflow as tf
 from tensorflow.python.keras.engine import network  # pylint: disable=g-direct-tensorflow-import
 from official.modeling import activations
 from official.nlp.modeling import layers
-from official.nlp.modeling.networks import transformer_encoder


 @tf.keras.utils.register_keras_serializable(package='Text')
@@ -159,7 +158,7 @@ class AlbertTransformerEncoder(network.Network):
       embeddings = tf.cast(embeddings, tf.float16)

     data = embeddings
-    attention_mask = transformer_encoder.MakeAttentionMaskLayer()([data, mask])
+    attention_mask = layers.SelfAttentionMask()([data, mask])

     shared_layer = layers.Transformer(
         num_attention_heads=num_attention_heads,
         intermediate_size=intermediate_size,
official/nlp/modeling/networks/encoder_scaffold.py (new file, mode 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer-based text encoder network."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

import inspect

import tensorflow as tf

from tensorflow.python.keras.engine import network  # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling import layers


@tf.keras.utils.register_keras_serializable(package='Text')
class EncoderScaffold(network.Network):
"""Bi-directional Transformer-based encoder network scaffold.
This network allows users to flexibly implement an encoder similar to the one
described in "BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding" (https://arxiv.org/abs/1810.04805).
In this network, users can choose to provide a custom embedding subnetwork
(which will replace the standard embedding logic) and/or a custom hidden layer
class (which will replace the Transformer instantiation in the encoder). For
each of these custom injection points, users can pass either a class or a
class instance. If a class is passed, that class will be instantiated using
the 'embedding_cfg' or 'hidden_cfg' argument, respectively; if an instance
is passed, that instance will be invoked. (In the case of hidden_cls, the
instance will be invoked 'num_hidden_instances' times.
If the hidden_cls is not overridden, a default transformer layer will be
instantiated.
Attributes:
num_output_classes: The output size of the classification layer.
classification_layer_initializer: The initializer for the classification
layer.
classification_layer_dtype: The dtype for the classification layer.
embedding_cls: The class or instance to use to embed the input data. This
class or instance defines the inputs to this encoder. If embedding_cls is
not set, a default embedding network (from the original BERT paper) will
be created.
embedding_cfg: A dict of kwargs to pass to the embedding_cls, if it needs to
be instantiated. If embedding_cls is not set, a config dict must be
passed to 'embedding_cfg' with the following values:
"vocab_size": The size of the token vocabulary.
"type_vocab_size": The size of the type vocabulary.
"hidden_size": The hidden size for this encoder.
"max_seq_length": The maximum sequence length for this encoder.
"seq_length": The sequence length for this encoder.
"initializer": The initializer for the embedding portion of this encoder.
"dropout_rate": The dropout rate to apply before the encoding layers.
"dtype": (Optional): The dtype of the embedding layers.
embedding_data: A reference to the embedding weights that will be used to
train the masked language model, if necessary. This is optional, and only
needed if (1) you are overriding embedding_cls and (2) are doing standard
pretraining.
num_hidden_instances: The number of times to instantiate and/or invoke the
hidden_cls.
hidden_cls: The class or instance to encode the input data. If hidden_cls is
not set, a KerasBERT transformer layer will be used as the encoder class.
hidden_cfg: A dict of kwargs to pass to the hidden_cls, if it needs to be
instantiated. If hidden_cls is not set, a config dict must be passed to
'hidden_cfg' with the following values:
"num_attention_heads": The number of attention heads. The hidden size
must be divisible by num_attention_heads.
"intermediate_size": The intermediate size of the transformer.
"intermediate_activation": The activation to apply in the transfomer.
"dropout_rate": The overall dropout rate for the transformer layers.
"attention_dropout_rate": The dropout rate for the attention layers.
"kernel_initializer": The initializer for the transformer layers.
"dtype": The dtype of the transformer.
"""
  def __init__(self,
               num_output_classes,
               classification_layer_initializer=tf.keras.initializers.
               TruncatedNormal(stddev=0.02),
               classification_layer_dtype=tf.float32,
               embedding_cls=None,
               embedding_cfg=None,
               embedding_data=None,
               num_hidden_instances=1,
               hidden_cls=layers.Transformer,
               hidden_cfg=None,
               **kwargs):
    self._self_setattr_tracking = False
    self._hidden_cls = hidden_cls
    self._hidden_cfg = hidden_cfg
    self._num_hidden_instances = num_hidden_instances
    self._num_output_classes = num_output_classes
    self._classification_layer_initializer = classification_layer_initializer
    self._embedding_cls = embedding_cls
    self._embedding_cfg = embedding_cfg
    self._embedding_data = embedding_data
    self._kwargs = kwargs
    if embedding_cls:
      if inspect.isclass(embedding_cls):
        self._embedding_network = embedding_cls(embedding_cfg)
      else:
        self._embedding_network = embedding_cls
      inputs = self._embedding_network.inputs
      embeddings, mask = self._embedding_network(inputs)
    else:
      self._embedding_network = None
      word_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_word_ids')
      mask = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_mask')
      type_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_type_ids')
      inputs = [word_ids, mask, type_ids]

      self._embedding_layer = layers.OnDeviceEmbedding(
          vocab_size=embedding_cfg['vocab_size'],
          embedding_width=embedding_cfg['hidden_size'],
          initializer=embedding_cfg['initializer'],
          name='word_embeddings')

      word_embeddings = self._embedding_layer(word_ids)

      # Always uses dynamic slicing for simplicity.
      self._position_embedding_layer = layers.PositionEmbedding(
          initializer=embedding_cfg['initializer'],
          use_dynamic_slicing=True,
          max_sequence_length=embedding_cfg['max_seq_length'])
      position_embeddings = self._position_embedding_layer(word_embeddings)

      type_embeddings = (
          layers.OnDeviceEmbedding(
              vocab_size=embedding_cfg['type_vocab_size'],
              embedding_width=embedding_cfg['hidden_size'],
              initializer=embedding_cfg['initializer'],
              use_one_hot=True,
              name='type_embeddings')(type_ids))

      embeddings = tf.keras.layers.Add()(
          [word_embeddings, position_embeddings, type_embeddings])
      embeddings = (
          tf.keras.layers.LayerNormalization(
              name='embeddings/layer_norm',
              axis=-1,
              epsilon=1e-12,
              dtype=tf.float32)(embeddings))
      embeddings = (
          tf.keras.layers.Dropout(
              rate=embedding_cfg['dropout_rate'], dtype=tf.float32)(embeddings))

      if embedding_cfg.get('dtype') == 'float16':
        embeddings = tf.cast(embeddings, tf.float16)

    attention_mask = layers.SelfAttentionMask()([embeddings, mask])
    data = embeddings

    for _ in range(num_hidden_instances):
      if inspect.isclass(hidden_cls):
        layer = self._hidden_cls(**hidden_cfg)
      else:
        layer = self._hidden_cls
      data = layer([data, attention_mask])

    first_token_tensor = (
        tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
            data))
    cls_output = tf.keras.layers.Dense(
        units=num_output_classes,
        activation='tanh',
        kernel_initializer=classification_layer_initializer,
        dtype=classification_layer_dtype,
        name='cls_transform')(first_token_tensor)

    super(EncoderScaffold, self).__init__(
        inputs=inputs, outputs=[data, cls_output], **kwargs)
  def get_config(self):
    config_dict = {
        'num_hidden_instances': self._num_hidden_instances,
        'num_output_classes': self._num_output_classes,
        'classification_layer_initializer':
            self._classification_layer_initializer,
        'embedding_cls': self._embedding_network,
        'embedding_cfg': self._embedding_cfg,
        'hidden_cfg': self._hidden_cfg,
    }
    if inspect.isclass(self._hidden_cls):
      config_dict['hidden_cls_string'] = tf.keras.utils.get_registered_name(
          self._hidden_cls)
    else:
      config_dict['hidden_cls'] = self._hidden_cls

    config_dict.update(self._kwargs)
    return config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    if 'hidden_cls_string' in config:
      config['hidden_cls'] = tf.keras.utils.get_registered_object(
          config['hidden_cls_string'], custom_objects=custom_objects)
      del config['hidden_cls_string']
    return cls(**config)

  def get_embedding_table(self):
    if self._embedding_network is None:
      # In this case, we don't have a custom embedding network and can return
      # the standard embedding data.
      return self._embedding_layer.embeddings

    if self._embedding_data is None:
      raise RuntimeError(('The EncoderScaffold %s does not have a reference '
                          'to the embedding data. This is required when you '
                          'pass a custom embedding network to the scaffold. '
                          'It is also possible that you are trying to get '
                          'embedding data from an embedding scaffold with a '
                          'custom embedding network where the scaffold has '
                          'been serialized and deserialized. Unfortunately, '
                          'accessing custom embedding references after '
                          'serialization is not yet supported.') % self.name)
    else:
      return self._embedding_data
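
Before the tests, a minimal construction sketch (illustrative only; the config keys follow the docstring above, and the values mirror the tests below):

import tensorflow as tf
from official.modeling import activations
from official.nlp.modeling.networks import encoder_scaffold

embedding_cfg = {
    'vocab_size': 100,
    'type_vocab_size': 16,
    'hidden_size': 32,
    'seq_length': 21,
    'max_seq_length': 21,
    'initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02),
    'dropout_rate': 0.1,
}
hidden_cfg = {
    'num_attention_heads': 2,
    'intermediate_size': 3072,
    'intermediate_activation': activations.gelu,
    'dropout_rate': 0.1,
    'attention_dropout_rate': 0.1,
    'kernel_initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02),
}
# Default embedding network plus three default Transformer layers.
encoder = encoder_scaffold.EncoderScaffold(
    num_output_classes=32,
    num_hidden_instances=3,
    embedding_cfg=embedding_cfg,
    hidden_cfg=hidden_cfg)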
official/nlp/modeling/networks/encoder_scaffold_test.py (new file, mode 100644)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for transformer-based text encoder network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.modeling import activations
from official.nlp.modeling import layers
from official.nlp.modeling.networks import encoder_scaffold


# Test class that wraps a standard transformer layer. If this layer is called
# at any point, the list passed to the config object will be filled with a
# boolean 'True'. We register this class as a Keras serializable so we can
# test serialization below.
@tf.keras.utils.register_keras_serializable(package="TestOnly")
class ValidatedTransformerLayer(layers.Transformer):

  def __init__(self, call_list, **kwargs):
    super(ValidatedTransformerLayer, self).__init__(**kwargs)
    self.list = call_list

  def call(self, inputs):
    self.list.append(True)
    return super(ValidatedTransformerLayer, self).call(inputs)

  def get_config(self):
    config = super(ValidatedTransformerLayer, self).get_config()
    config["call_list"] = []
    return config
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):

  def test_network_creation(self):
    hidden_size = 32
    sequence_length = 21
    num_hidden_instances = 3
    embedding_cfg = {
        "vocab_size": 100,
        "type_vocab_size": 16,
        "hidden_size": hidden_size,
        "seq_length": sequence_length,
        "max_seq_length": sequence_length,
        "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "dropout_rate": 0.1,
    }

    call_list = []
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float32",
        "call_list": call_list
    }
    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=num_hidden_instances,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cls=ValidatedTransformerLayer,
        hidden_cfg=hidden_cfg,
        embedding_cfg=embedding_cfg)

    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    data, pooled = test_network([word_ids, mask, type_ids])

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)

    # If call_list[0] exists and is True, the passed layer class was
    # instantiated from the given config properly.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
  def test_network_creation_with_float16_dtype(self):
    tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
    hidden_size = 32
    sequence_length = 21
    embedding_cfg = {
        "vocab_size": 100,
        "type_vocab_size": 16,
        "hidden_size": hidden_size,
        "seq_length": sequence_length,
        "max_seq_length": sequence_length,
        "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "dropout_rate": 0.1,
        "dtype": "float16",
    }
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float16",
    }
    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        classification_layer_dtype=tf.float16,
        hidden_cfg=hidden_cfg,
        embedding_cfg=embedding_cfg)

    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    data, pooled = test_network([word_ids, mask, type_ids])

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # If float_dtype is set to float16, the output should always be float16.
    self.assertAllEqual(tf.float16, data.dtype)
    self.assertAllEqual(tf.float16, pooled.dtype)
  def test_network_invocation(self):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7
    embedding_cfg = {
        "vocab_size": vocab_size,
        "type_vocab_size": num_types,
        "hidden_size": hidden_size,
        "seq_length": sequence_length,
        "max_seq_length": sequence_length,
        "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "dropout_rate": 0.1,
    }
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float32",
    }
    tf.keras.mixed_precision.experimental.set_policy("float32")
    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
        embedding_cfg=embedding_cfg)

    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    data, pooled = test_network([word_ids, mask, type_ids])

    # Create a model based off of this network:
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])

    # Invoke the model. We can't validate the output data here (the model is
    # too complex) but this will catch structural runtime errors.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    _ = model.predict([word_id_data, mask_data, type_id_data])

    # Creates an EncoderScaffold with max_sequence_length != sequence_length.
    num_types = 7
    embedding_cfg = {
        "vocab_size": vocab_size,
        "type_vocab_size": num_types,
        "hidden_size": hidden_size,
        "seq_length": sequence_length,
        "max_seq_length": sequence_length * 2,
        "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "dropout_rate": 0.1,
    }
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
    }
    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
        embedding_cfg=embedding_cfg)
    data, pooled = test_network([word_ids, mask, type_ids])
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
    _ = model.predict([word_id_data, mask_data, type_id_data])
  def test_serialize_deserialize(self):
    # Create a network object that sets all of its config options.
    hidden_size = 32
    sequence_length = 21
    embedding_cfg = {
        "vocab_size": 100,
        "type_vocab_size": 16,
        "hidden_size": hidden_size,
        "seq_length": sequence_length,
        "max_seq_length": sequence_length,
        "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "dropout_rate": 0.1,
    }
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float32",
    }
    # Create a small EncoderScaffold for testing.
    network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
        embedding_cfg=embedding_cfg)

    # Create another network object from the first object's config.
    new_network = encoder_scaffold.EncoderScaffold.from_config(
        network.get_config())

    # Validate that the config can be forced to JSON.
    _ = new_network.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(network.get_config(), new_network.get_config())
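
  # Note: the round-trip above works because get_config() stores the
  # registered name of the hidden class rather than the class object itself
  # (see get_config/from_config in encoder_scaffold.py). A sketch of that
  # registry mechanism, assuming the "TestOnly" registration at the top of
  # this file:
  #
  #   name = tf.keras.utils.get_registered_name(ValidatedTransformerLayer)
  #   # expected to be "TestOnly>ValidatedTransformerLayer"
  #   restored = tf.keras.utils.get_registered_object(name)
  #   assert restored is ValidatedTransformerLayer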
@keras_parameterized.run_all_keras_modes
class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):

  def test_network_invocation(self):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57

    # Build an embedding network to swap in for the default network. This one
    # will have 2 inputs (mask and word_ids) instead of 3, and won't use
    # positional embeddings.
    word_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
    mask = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_mask")
    embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=vocab_size,
        embedding_width=hidden_size,
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="word_embeddings")
    word_embeddings = embedding_layer(word_ids)
    network = tf.keras.Model([word_ids, mask], [word_embeddings, mask])

    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float32",
    }

    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
        embedding_cls=network,
        embedding_data=embedding_layer.embeddings)

    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    data, pooled = test_network([word_ids, mask])

    # Create a model based off of this network:
    model = tf.keras.Model([word_ids, mask], [data, pooled])

    # Invoke the model. We can't validate the output data here (the model is
    # too complex) but this will catch structural runtime errors.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    _ = model.predict([word_id_data, mask_data])

    # Test that we can get the embedding data that we passed to the object.
    # This is necessary to support standard language model training.
    self.assertIs(embedding_layer.embeddings,
                  test_network.get_embedding_table())
  def test_serialize_deserialize(self):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57

    # Build an embedding network to swap in for the default network. This one
    # will have 2 inputs (mask and word_ids) instead of 3, and won't use
    # positional embeddings.
    word_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
    mask = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_mask")
    embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=vocab_size,
        embedding_width=hidden_size,
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="word_embeddings")
    word_embeddings = embedding_layer(word_ids)
    network = tf.keras.Model([word_ids, mask], [word_embeddings, mask])

    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float32",
    }

    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
        embedding_cls=network,
        embedding_data=embedding_layer.embeddings)

    # Create another network object from the first object's config.
    new_network = encoder_scaffold.EncoderScaffold.from_config(
        test_network.get_config())

    # Validate that the config can be forced to JSON.
    _ = new_network.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(test_network.get_config(), new_network.get_config())

    # Create a model based off of the old and new networks:
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

    data, pooled = new_network([word_ids, mask])
    new_model = tf.keras.Model([word_ids, mask], [data, pooled])

    data, pooled = test_network([word_ids, mask])
    model = tf.keras.Model([word_ids, mask], [data, pooled])

    # Copy the weights between models.
    new_model.set_weights(model.get_weights())

    # Invoke the models.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    data, cls = model.predict([word_id_data, mask_data])
    new_data, new_cls = new_model.predict([word_id_data, mask_data])

    # The output should be equal.
    self.assertAllEqual(data, new_data)
    self.assertAllEqual(cls, new_cls)

    # We should not be able to get a reference to the embedding data.
    with self.assertRaisesRegex(RuntimeError, ".*does not have a reference.*"):
      new_network.get_embedding_table()
@keras_parameterized.run_all_keras_modes
class EncoderScaffoldHiddenInstanceTest(keras_parameterized.TestCase):

  def test_network_invocation(self):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7
    embedding_cfg = {
        "vocab_size": vocab_size,
        "type_vocab_size": num_types,
        "hidden_size": hidden_size,
        "seq_length": sequence_length,
        "max_seq_length": sequence_length,
        "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "dropout_rate": 0.1,
        "dtype": "float32",
    }

    call_list = []
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float32",
        "call_list": call_list
    }
    # Create a small EncoderScaffold for testing. This time, we pass an
    # already-instantiated layer object.
    xformer = ValidatedTransformerLayer(**hidden_cfg)

    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cls=xformer,
        embedding_cfg=embedding_cfg)

    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    data, pooled = test_network([word_ids, mask, type_ids])

    # Create a model based off of this network:
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])

    # Invoke the model. We can't validate the output data here (the model is
    # too complex) but this will catch structural runtime errors.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    _ = model.predict([word_id_data, mask_data, type_id_data])

    # If call_list[0] exists and is True, the passed layer class was
    # called as part of the graph creation.
    self.assertNotEmpty(call_list)
    self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
  def test_serialize_deserialize(self):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7
    embedding_cfg = {
        "vocab_size": vocab_size,
        "type_vocab_size": num_types,
        "hidden_size": hidden_size,
        "seq_length": sequence_length,
        "max_seq_length": sequence_length,
        "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        "dropout_rate": 0.1,
        "dtype": "float32",
    }

    call_list = []
    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        "dtype": "float32",
        "call_list": call_list
    }
    # Create a small EncoderScaffold for testing. This time, we pass an
    # already-instantiated layer object.
    xformer = ValidatedTransformerLayer(**hidden_cfg)

    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        num_output_classes=hidden_size,
        classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cls=xformer,
        embedding_cfg=embedding_cfg)

    # Create another network object from the first object's config.
    new_network = encoder_scaffold.EncoderScaffold.from_config(
        test_network.get_config())

    # Validate that the config can be forced to JSON.
    _ = new_network.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(test_network.get_config(), new_network.get_config())

    # Create a model based off of the old and new networks:
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

    data, pooled = new_network([word_ids, mask, type_ids])
    new_model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])

    data, pooled = test_network([word_ids, mask, type_ids])
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])

    # Copy the weights between models.
    new_model.set_weights(model.get_weights())

    # Invoke the models.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    data, cls = model.predict([word_id_data, mask_data, type_id_data])
    new_data, new_cls = new_model.predict(
        [word_id_data, mask_data, type_id_data])

    # The output should be equal.
    self.assertAllEqual(data, new_data)
    self.assertAllEqual(cls, new_cls)
if __name__ == "__main__":
  assert tf.version.VERSION.startswith('2.')
  tf.test.main()
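
The hidden-instance tests above pass a pre-built layer object rather than a class, so all num_hidden_instances invocations share one set of weights (the ALBERT-style sharing pattern). A condensed sketch of that usage, reusing the embedding_cfg and hidden_cfg shapes from the tests above:

xformer = layers.Transformer(
    num_attention_heads=2,
    intermediate_size=3072,
    intermediate_activation=activations.gelu,
    dropout_rate=0.1,
    attention_dropout_rate=0.1,
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
shared_encoder = encoder_scaffold.EncoderScaffold(
    num_output_classes=32,
    num_hidden_instances=3,   # the single layer object is invoked three times
    hidden_cls=xformer,       # an instance, not a class: weights are shared
    embedding_cfg=embedding_cfg)  # embedding_cfg as defined in the tests above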
official/nlp/modeling/networks/transformer_encoder.py

@@ -23,7 +23,6 @@ import tensorflow as tf
 from tensorflow.python.keras.engine import network  # pylint: disable=g-direct-tensorflow-import
 from official.modeling import activations
-from official.nlp import bert_modeling
 from official.nlp.modeling import layers
@@ -145,7 +144,7 @@ class TransformerEncoder(network.Network):
       embeddings = tf.cast(embeddings, tf.float16)

     data = embeddings
-    attention_mask = MakeAttentionMaskLayer()([data, mask])
+    attention_mask = layers.SelfAttentionMask()([data, mask])

     for i in range(num_layers):
       layer = layers.Transformer(
           num_attention_heads=num_attention_heads,
@@ -182,11 +181,3 @@ class TransformerEncoder(network.Network):
   @classmethod
   def from_config(cls, config, custom_objects=None):
     return cls(**config)
-
-
-@tf.keras.utils.register_keras_serializable(package='Text')
-class MakeAttentionMaskLayer(tf.keras.layers.Layer):
-
-  def call(self, inputs):
-    return bert_modeling.create_attention_mask_from_input_mask(
-        inputs[0], inputs[1])
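
The swap from MakeAttentionMaskLayer to layers.SelfAttentionMask is intended to be behavior-preserving: both produce the [batch_size, from_seq_length, to_seq_length] broadcast of a 2D padding mask. A quick hand-check against a manual broadcast (toy shapes; assumes the SelfAttentionMask layer added in this commit):

import tensorflow as tf
from official.nlp.modeling import layers

data = tf.zeros([2, 4, 8])                        # [batch, from_seq, width]
mask2d = tf.constant([[1, 1, 0, 0],
                      [1, 1, 1, 0]], tf.int32)    # [batch, to_seq]

layer_mask = layers.SelfAttentionMask()([data, mask2d])
manual = tf.ones([2, 4, 1]) * tf.cast(mask2d[:, tf.newaxis, :], tf.float32)
tf.debugging.assert_near(layer_mask, manual)      # the same [2, 4, 4] mask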