"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "ccd3047a1d62048cc5707e60181b2ab586b8e479"
Commit 2a5c349d authored by Chen Chen, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 333784610
parent 07a07f6a
@@ -40,8 +40,7 @@ class AlbertConfig(configs.BertConfig):
     super(AlbertConfig, self).__init__(**kwargs)
     # TODO(chendouble): 'inner_group_num' and 'num_hidden_groups' are always 1
-    # in the released ALBERT. Support other values in AlbertTransformerEncoder
-    # if needed.
+    # in the released ALBERT. Support other values in AlbertEncoder if needed.
     if inner_group_num != 1 or num_hidden_groups != 1:
       raise ValueError("We only support 'inner_group_num' and "
                        "'num_hidden_groups' as 1.")
...
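The guard above keeps the released-ALBERT restriction that both 'inner_group_num' and 'num_hidden_groups' equal 1. A minimal sketch of what that means for callers; the import path and the BertConfig-style keyword values are assumptions, not taken from this diff:

from official.nlp.albert import configs as albert_configs  # import path assumed

# The default grouping (1 and 1) is accepted.
config = albert_configs.AlbertConfig(
    vocab_size=30000,   # illustrative BertConfig-style values
    hidden_size=768,
    embedding_size=128)

# Any other grouping is rejected, matching the ValueError in the hunk above.
try:
  albert_configs.AlbertConfig(vocab_size=30000, inner_group_num=2)
except ValueError as err:
  print(err)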
@@ -15,7 +15,7 @@
 """A converter from a tf1 ALBERT encoder checkpoint to a tf2 encoder checkpoint.

 The conversion will yield an object-oriented checkpoint that can be used
-to restore a AlbertTransformerEncoder object.
+to restore an AlbertEncoder object.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -81,7 +81,7 @@ def _create_albert_model(cfg):
   Returns:
     A keras model.
   """
-  albert_encoder = networks.AlbertTransformerEncoder(
+  albert_encoder = networks.AlbertEncoder(
       vocab_size=cfg.vocab_size,
       hidden_size=cfg.hidden_size,
       embedding_width=cfg.embedding_size,
...
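Per the converter docstring, the result is an object-oriented (TF2) checkpoint that an AlbertEncoder can restore. A hedged sketch of the restore side; the checkpoint attribute name (model=), the checkpoint path, and the shape values are assumptions, not taken from this diff:

import tensorflow as tf
from official.nlp.modeling import networks

# Rebuild the encoder with the same shape parameters the converter used
# (in _create_albert_model these come from cfg).
albert_encoder = networks.AlbertEncoder(
    vocab_size=30000,        # illustrative values
    hidden_size=768,
    embedding_width=128,
    num_layers=12,
    num_attention_heads=12)

# Restore the converted object-oriented checkpoint into the encoder.
status = tf.train.Checkpoint(model=albert_encoder).restore("/path/to/converted_ckpt")
status.assert_existing_objects_matched()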
@@ -167,7 +167,7 @@ def get_transformer_encoder(bert_config,
       initializer=tf.keras.initializers.TruncatedNormal(
           stddev=bert_config.initializer_range))
   if isinstance(bert_config, albert_configs.AlbertConfig):
-    return networks.AlbertTransformerEncoder(**kwargs)
+    return networks.AlbertEncoder(**kwargs)
   else:
     assert isinstance(bert_config, configs.BertConfig)
     kwargs['output_range'] = output_range
...
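get_transformer_encoder dispatches on the concrete config class, so an AlbertConfig now yields an AlbertEncoder while a plain BertConfig yields a BertEncoder. A small illustrative helper restating that dispatch (the function below is not part of the codebase, and the import paths are assumed):

from official.nlp.albert import configs as albert_configs  # paths assumed
from official.nlp.bert import configs
from official.nlp.modeling import networks

def encoder_class_for(bert_config):
  """Illustrative only: mirrors the isinstance dispatch in the hunk above."""
  if isinstance(bert_config, albert_configs.AlbertConfig):
    return networks.AlbertEncoder
  assert isinstance(bert_config, configs.BertConfig)
  return networks.BertEncoder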
@@ -149,7 +149,7 @@ class EncoderConfig(hyperparams.OneOfConfig):
 ENCODER_CLS = {
     "bert": networks.BertEncoder,
     "mobilebert": networks.MobileBERTEncoder,
-    "albert": networks.AlbertTransformerEncoder,
+    "albert": networks.AlbertEncoder,
     "bigbird": bigbird_encoder.BigBirdEncoder,
 }
...
@@ -10,7 +10,7 @@ Transformer-based encoder as described in ["BERT: Pre-training of Deep
 Bidirectional Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding lookups,
 transformer layers and pooling layer.

-* [`AlbertTransformerEncoder`](albert_transformer_encoder.py) implements a
+* [`AlbertEncoder`](albert_encoder.py) implements a
   Transformer-encoder described in the paper ["ALBERT: A Lite BERT for
   Self-supervised Learning of Language Representations"]
   (https://arxiv.org/abs/1909.11942). Compared with [BERT](https://arxiv.org/abs/1810.04805), ALBERT refactorizes embedding parameters
...
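The README bullet describes AlbertEncoder as the ALBERT Transformer encoder with factorized embeddings, i.e. token embeddings of width embedding_width projected up to hidden_size. A minimal construction sketch with illustrative sizes (not values from this diff):

import tensorflow as tf
from official.nlp.modeling import networks

# embedding_width < hidden_size is the factorized-embedding setup ALBERT uses.
encoder = networks.AlbertEncoder(
    vocab_size=30000,
    embedding_width=128,
    hidden_size=768,
    num_layers=12,
    num_attention_heads=12)

seq_len = 64
word_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)
mask = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)
# With the default list outputs, the encoder returns the sequence and pooled tensors.
sequence_output, pooled_output = encoder([word_ids, mask, type_ids])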
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """Networks package definition."""
-from official.nlp.modeling.networks.albert_transformer_encoder import AlbertTransformerEncoder
+from official.nlp.modeling.networks.albert_encoder import AlbertEncoder
 from official.nlp.modeling.networks.bert_encoder import BertEncoder
 from official.nlp.modeling.networks.classification import Classification
 from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
...
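After this change the class is exported from the networks package under its new name; both import forms below follow directly from the updated import line above and the test import further down:

from official.nlp.modeling import networks
from official.nlp.modeling.networks.albert_encoder import AlbertEncoder

# The package-level attribute and the module-level class are the same object.
assert networks.AlbertEncoder is AlbertEncoder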
@@ -23,7 +23,7 @@ from official.nlp.modeling import layers

 @tf.keras.utils.register_keras_serializable(package='Text')
-class AlbertTransformerEncoder(tf.keras.Model):
+class AlbertEncoder(tf.keras.Model):
   """ALBERT (https://arxiv.org/abs/1810.04805) text encoder network.

   This network implements the encoder described in the paper "ALBERT: A Lite
@@ -158,8 +158,10 @@ class AlbertTransformerEncoder(tf.keras.Model):
         attention_dropout=attention_dropout_rate,
         kernel_initializer=initializer,
         name='transformer')
+    encoder_outputs = []
     for _ in range(num_layers):
       data = shared_layer([data, attention_mask])
+      encoder_outputs.append(data)

     first_token_tensor = (
         tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data)
@@ -173,12 +175,13 @@ class AlbertTransformerEncoder(tf.keras.Model):
     if dict_outputs:
       outputs = dict(
           sequence_output=data,
+          encoder_outputs=encoder_outputs,
           pooled_output=cls_output,
       )
     else:
       outputs = [data, cls_output]

-    super(AlbertTransformerEncoder, self).__init__(
+    super(AlbertEncoder, self).__init__(
         inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)

   def get_embedding_table(self):
...
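The two added lines collect the activation after every pass through the single shared transformer layer, so encoder_outputs ends up with num_layers entries even though the weights are shared. A stand-alone sketch of that sharing-and-collecting pattern, using an ordinary Keras layer as a stand-in for the shared transformer block:

import tensorflow as tf

num_layers = 3
inputs = tf.keras.Input(shape=(16, 8))

# One layer instance reused for every "layer" of the stack: ALBERT-style
# cross-layer parameter sharing.
shared_block = tf.keras.layers.Dense(8, activation="relu", name="shared_block")

data = inputs
encoder_outputs = []
for _ in range(num_layers):
  data = shared_block(data)
  encoder_outputs.append(data)  # one entry per pass, all produced by the same weights

model = tf.keras.Model(inputs=inputs, outputs=encoder_outputs)
assert len(model.outputs) == num_layers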
@@ -23,16 +23,16 @@ import numpy as np
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
-from official.nlp.modeling.networks import albert_transformer_encoder
+from official.nlp.modeling.networks import albert_encoder


 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
 # guarantees forward compatibility of this code for the V2 switchover.
 @keras_parameterized.run_all_keras_modes
-class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
+class AlbertEncoderTest(keras_parameterized.TestCase):

   def tearDown(self):
-    super(AlbertTransformerEncoderTest, self).tearDown()
+    super(AlbertEncoderTest, self).tearDown()
     tf.keras.mixed_precision.experimental.set_policy("float32")

   @parameterized.named_parameters(
@@ -52,7 +52,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
     # Create a small TransformerEncoder for testing.
-    test_network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
+    test_network = albert_encoder.AlbertEncoder(**kwargs)

     # Create the inputs (note that the first dimension is implicit).
     word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -84,13 +84,14 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     sequence_length = 21
     vocab_size = 57
     num_types = 7
+    num_layers = 3
     # Create a small TransformerEncoder for testing.
-    test_network = albert_transformer_encoder.AlbertTransformerEncoder(
+    test_network = albert_encoder.AlbertEncoder(
         vocab_size=vocab_size,
         embedding_width=8,
         hidden_size=hidden_size,
         num_attention_heads=2,
-        num_layers=3,
+        num_layers=num_layers,
         type_vocab_size=num_types)
     # Create the inputs (note that the first dimension is implicit).
     word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -113,25 +114,25 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     # Creates a TransformerEncoder with max_sequence_length != sequence_length
     max_sequence_length = 128
-    test_network = albert_transformer_encoder.AlbertTransformerEncoder(
+    test_network = albert_encoder.AlbertEncoder(
         vocab_size=vocab_size,
         embedding_width=8,
         hidden_size=hidden_size,
         max_sequence_length=max_sequence_length,
         num_attention_heads=2,
-        num_layers=3,
+        num_layers=num_layers,
         type_vocab_size=num_types)
     model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
     _ = model.predict([word_id_data, mask_data, type_id_data])

     # Tests dictionary outputs.
-    test_network_dict = albert_transformer_encoder.AlbertTransformerEncoder(
+    test_network_dict = albert_encoder.AlbertEncoder(
         vocab_size=vocab_size,
         embedding_width=8,
         hidden_size=hidden_size,
         max_sequence_length=max_sequence_length,
         num_attention_heads=2,
-        num_layers=3,
+        num_layers=num_layers,
         type_vocab_size=num_types,
         dict_outputs=True)
     _ = test_network_dict([word_ids, mask, type_ids])
@@ -144,6 +145,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
             input_type_ids=type_id_data))
     self.assertAllEqual(list_outputs[0], dict_outputs["sequence_output"])
     self.assertAllEqual(list_outputs[1], dict_outputs["pooled_output"])
+    self.assertLen(dict_outputs["encoder_outputs"], num_layers)
   def test_serialize_deserialize(self):
     tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
@@ -161,7 +163,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
         dropout_rate=0.05,
         attention_dropout_rate=0.22,
         initializer="glorot_uniform")
-    network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
+    network = albert_encoder.AlbertEncoder(**kwargs)

     expected_config = dict(kwargs)
     expected_config["activation"] = tf.keras.activations.serialize(
@@ -172,7 +174,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     # Create another network object from the first object's config.
     new_network = (
-        albert_transformer_encoder.AlbertTransformerEncoder.from_config(
+        albert_encoder.AlbertEncoder.from_config(
             network.get_config()))

     # Validate that the config can be forced to JSON.
...
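Taken together, the test changes check that with dict_outputs=True the encoder returns sequence_output, pooled_output, and one encoder_outputs entry per (shared) layer. A short usage sketch along the same lines; hidden_size=32 and the batch size are illustrative assumptions rather than the test's values:

import numpy as np
from official.nlp.modeling import networks

num_layers = 3
seq_len = 21
encoder = networks.AlbertEncoder(
    vocab_size=57,
    embedding_width=8,
    hidden_size=32,
    num_attention_heads=2,
    num_layers=num_layers,
    type_vocab_size=7,
    dict_outputs=True)

word_ids = np.random.randint(57, size=(2, seq_len)).astype("int32")
mask = np.ones((2, seq_len), dtype="int32")
type_ids = np.random.randint(7, size=(2, seq_len)).astype("int32")

outputs = encoder([word_ids, mask, type_ids])
# One per-layer activation per pass through the shared transformer layer.
assert len(outputs["encoder_outputs"]) == num_layers
print(outputs["sequence_output"].shape, outputs["pooled_output"].shape)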