"vscode:/vscode.git/clone" did not exist on "fb41088f854abfdb3824adf9cd3d3569f9ebaabe"
Commit 71a2fc91 authored by Chen Chen, committed by A. Unique TensorFlower

Clean keras_nlp.BertEncoder:

Remove the return_all_encoder_outputs argument from keras_nlp.BertEncoder: it is unused, and all encoder outputs are always returned in the output dict.

PiperOrigin-RevId: 332802761
parent 1308ecdc
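
For illustration, a minimal sketch of what calling the encoder looks like after this change. The import path, the list-style input convention `[word_ids, mask, type_ids]`, and the `sequence_output`/`pooled_output` key names are assumptions based on the surrounding Model Garden code; only the `encoder_outputs` key is stated in the diff below. Constructor arguments mirror the updated test.

```python
import numpy as np

from official.nlp.keras_nlp.encoders import bert_encoder  # import path is an assumption

# Same small configuration as the updated test; note there is no
# return_all_encoder_outputs flag anymore.
encoder = bert_encoder.BertEncoder(
    vocab_size=100,
    hidden_size=32,
    num_attention_heads=2,
    num_layers=3)

batch_size, sequence_length = 2, 8
word_ids = np.random.randint(0, 100, size=(batch_size, sequence_length), dtype=np.int32)
mask = np.ones((batch_size, sequence_length), dtype=np.int32)
type_ids = np.zeros((batch_size, sequence_length), dtype=np.int32)

# The list input convention [word_ids, mask, type_ids] is assumed here.
outputs = encoder([word_ids, mask, type_ids])

# Every transformer layer's sequence embedding is always in the dict,
# keyed by `encoder_outputs` (per the docstring note in this commit).
assert len(outputs['encoder_outputs']) == 3
print(outputs['sequence_output'].shape)  # (2, 8, 32); key name assumed
print(outputs['pooled_output'].shape)    # (2, 32); key name assumed
```

Callers that previously set return_all_encoder_outputs=True can simply read outputs['encoder_outputs'].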
@@ -56,8 +56,6 @@ class BertEncoder(tf.keras.Model):
 attention_dropout: The dropout rate to use for the attention layers
   within the transformer layers.
 initializer: The initializer to use for all weights in this encoder.
-return_all_encoder_outputs: Whether to output sequence embedding outputs of
-  all encoder transformer layers.
 output_range: The sequence output range, [0, output_range), by slicing the
   target sequence of the last transformer layer. `None` means the entire
   target sequence will attend to the source sequence, which yields the full
@@ -82,7 +80,6 @@ class BertEncoder(tf.keras.Model):
 output_dropout=0.1,
 attention_dropout=0.1,
 initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
-return_all_encoder_outputs=False,
 output_range=None,
 embedding_width=None,
 **kwargs):
@@ -102,7 +99,6 @@ class BertEncoder(tf.keras.Model):
 'output_dropout': output_dropout,
 'attention_dropout': attention_dropout,
 'initializer': tf.keras.initializers.serialize(initializer),
-'return_all_encoder_outputs': return_all_encoder_outputs,
 'output_range': output_range,
 'embedding_width': embedding_width,
 }
......
@@ -69,8 +69,7 @@ class BertEncoderTest(keras_parameterized.TestCase):
 vocab_size=100,
 hidden_size=hidden_size,
 num_attention_heads=2,
-num_layers=3,
-return_all_encoder_outputs=True)
+num_layers=3)
 # Create the inputs (note that the first dimension is implicit).
 word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
 mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -204,7 +203,6 @@ class BertEncoderTest(keras_parameterized.TestCase):
 output_dropout=0.05,
 attention_dropout=0.22,
 initializer="glorot_uniform",
-return_all_encoder_outputs=False,
 output_range=-1,
 embedding_width=16)
 network = bert_encoder.BertEncoder(**kwargs)
......
@@ -60,7 +60,9 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
   within the transformer layers.
 initializer: The initializer to use for all weights in this encoder.
 return_all_encoder_outputs: Whether to output sequence embedding outputs of
-  all encoder transformer layers.
+  all encoder transformer layers. Note: when the following `dict_outputs`
+  argument is True, all encoder outputs are always returned in the dict,
+  keyed by `encoder_outputs`.
 output_range: The sequence output range, [0, output_range), by slicing the
   target sequence of the last transformer layer. `None` means the entire
   target sequence will attend to the source sequence, which yields the full
@@ -112,7 +114,6 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
 output_dropout=dropout_rate,
 attention_dropout=attention_dropout_rate,
 initializer=initializer,
-return_all_encoder_outputs=return_all_encoder_outputs,
 output_range=output_range,
 embedding_width=embedding_width)
@@ -123,6 +124,7 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
 self._config_dict['attention_dropout_rate'] = self._config_dict.pop(
     'attention_dropout')
 self._config_dict['dict_outputs'] = dict_outputs
+self._config_dict['return_all_encoder_outputs'] = return_all_encoder_outputs
 if dict_outputs:
   return
......
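
The last two hunks keep return_all_encoder_outputs on the wrapper class only: it is no longer forwarded to keras_nlp.BertEncoder, but it is recorded in the wrapper's own config dict, and per the docstring note above it has no effect on the outputs when `dict_outputs` is True. A minimal sketch of that behavior, assuming the `official.nlp.modeling.networks` import path and the usual `get_config`/`from_config` pattern of these network classes:

```python
from official.nlp.modeling import networks  # import path is an assumption

encoder = networks.BertEncoder(
    vocab_size=100,
    hidden_size=32,
    num_attention_heads=2,
    num_layers=3,
    dict_outputs=True,                # dict outputs always include all layers
    return_all_encoder_outputs=True)  # kept only for config compatibility

config = encoder.get_config()
# The flag now lives in the wrapper's config dict instead of being passed
# down to keras_nlp.BertEncoder.
assert config['return_all_encoder_outputs'] is True

# Rebuilding from the config is assumed to work as for other Model Garden
# networks (cls(**config)).
restored = networks.BertEncoder.from_config(config)
```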