Commit b0ccdb11 authored by Shixin Luo

resolve conflict with master

parents e61588cd 1611a8c5
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Transformer XL."""
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import transformer_xl


def create_mock_transformer_xl_data(
    batch_size,
    num_heads,
    head_size,
    hidden_size,
    seq_length,
    memory_length=0,
    num_predictions=2,
    two_stream=False,
    num_layers=1,
    include_biases=True,
    include_state=False,
    include_mask=False,
    include_segment=False):
  """Creates mock testing data.

  Args:
    batch_size: `int`, the batch size.
    num_heads: `int`, the number of attention heads.
    head_size: `int`, the size of each attention head.
    hidden_size: `int`, the layer's hidden size.
    seq_length: `int`, the sequence length of the input.
    memory_length: optional `int`, the length of the state. Defaults to 0.
    num_predictions: `int`, the number of predictions used in two-stream
      attention.
    two_stream: `bool`, whether or not to generate two-stream data.
    num_layers: `int`, the number of Transformer XL blocks.
    include_biases: optional `bool`, whether or not to include attention
      biases.
    include_state: optional `bool`, whether or not to include state data.
    include_mask: optional `bool`, whether or not to include mask data.
    include_segment: optional `bool`, whether or not to include segment data.

  Returns:
    A dictionary with `str` as keys and `Tensor` as values.
  """
  encoding_shape = (batch_size, seq_length * 2, hidden_size)

  data = dict(
      relative_position_encoding=tf.random.normal(shape=encoding_shape),
      content_stream=tf.random.normal(
          shape=(batch_size, seq_length, hidden_size)))

  if include_biases:
    attention_bias_shape = (num_heads, head_size)
    data.update(dict(
        content_attention_bias=tf.random.normal(shape=attention_bias_shape),
        segment_attention_bias=tf.random.normal(shape=attention_bias_shape),
        positional_attention_bias=tf.random.normal(
            shape=attention_bias_shape)))

  if two_stream:
    data.update(dict(
        query_stream=tf.random.normal(
            shape=(batch_size, num_predictions, hidden_size)),
        target_mapping=tf.random.normal(
            shape=(batch_size, num_predictions, seq_length))))

  if include_state:
    total_seq_length = seq_length + memory_length
    if num_layers > 1:
      state_shape = (num_layers, batch_size, memory_length, hidden_size)
    else:
      state_shape = (batch_size, memory_length, hidden_size)
    data.update(dict(state=tf.random.normal(shape=state_shape)))
  else:
    total_seq_length = seq_length

  if include_mask:
    mask_shape = (batch_size, num_heads, seq_length, total_seq_length)
    mask_data = np.random.randint(2, size=mask_shape).astype("float32")
    data["content_attention_mask"] = mask_data
    if two_stream:
      data["query_attention_mask"] = mask_data

  if include_segment:
    # A Transformer XL block takes an individual segment "encoding" from the
    # entirety of the Transformer XL segment "embedding".
    if num_layers > 1:
      segment_encoding_shape = (num_layers, 2, num_heads, head_size)
      segment_encoding_name = "segment_embedding"
    else:
      segment_encoding_shape = (2, num_heads, head_size)
      segment_encoding_name = "segment_encoding"
    segment_matrix = np.random.randint(
        2, size=(batch_size, seq_length, total_seq_length))
    data["segment_matrix"] = tf.math.equal(segment_matrix, 1)
    data[segment_encoding_name] = tf.random.normal(
        shape=segment_encoding_shape)

  return data
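
# A minimal usage sketch of the helper above (illustrative only; the shapes
# in the trailing comments follow from the default arguments):
#
#   data = create_mock_transformer_xl_data(
#       batch_size=2, num_heads=4, head_size=8, hidden_size=32, seq_length=6)
#   data["content_stream"]              # shape: (2, 6, 32)
#   data["relative_position_encoding"]  # shape: (2, 12, 32)
#   data["content_attention_bias"]      # shape: (4, 8)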


@keras_parameterized.run_all_keras_modes
class TransformerXLBlockTest(keras_parameterized.TestCase):

  @combinations.generate(combinations.combine(
      memory_length=[0, 4],
      two_stream=[True, False],
      state=[True, False],
      mask=[True, False],
      segment=[True, False]))
  def test_transformer_xl_block(
      self,
      two_stream,
      memory_length,
      state,
      mask,
      segment):
    """Tests combinations of Transformer XL block calculations."""
    batch_size, num_heads, head_size, seq_length = 2, 12, 64, 8
    hidden_size, num_predictions, inner_size = 24, 8, 12

    data = create_mock_transformer_xl_data(
        include_biases=True,
        num_heads=num_heads,
        head_size=head_size,
        hidden_size=hidden_size,
        seq_length=seq_length,
        batch_size=batch_size,
        memory_length=memory_length,
        num_predictions=num_predictions,
        two_stream=two_stream,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    test_layer = transformer_xl.TransformerXLBlock(
        vocab_size=32000,
        hidden_size=hidden_size,
        num_attention_heads=num_heads,
        head_size=head_size,
        inner_size=inner_size,
        dropout_rate=0.,
        attention_dropout_rate=0.,
        two_stream=two_stream)
    output = test_layer(**data)
    content_attention = output["content_attention"]
    self.assertEqual(content_attention.shape,
                     [batch_size, seq_length, hidden_size])

    if two_stream:
      self.assertIn("query_attention", output)
      self.assertEqual(output["query_attention"].shape,
                       [batch_size, num_predictions, hidden_size])
    else:
      self.assertNotIn("query_attention", output)

  def test_get_config(self):
    transformer_xl_block = transformer_xl.TransformerXLBlock(
        vocab_size=32000,
        head_size=64,
        num_attention_heads=2,
        hidden_size=10,
        inner_size=50,
        dropout_rate=0.,
        attention_dropout_rate=0.,
        two_stream=False)
    transformer_xl_block_config = transformer_xl_block.get_config()
    new_block = transformer_xl.TransformerXLBlock.from_config(
        transformer_xl_block_config)
    self.assertEqual(transformer_xl_block_config, new_block.get_config())


@keras_parameterized.run_all_keras_modes
class TransformerXLTest(keras_parameterized.TestCase):

  @combinations.generate(combinations.combine(
      two_stream=[True, False],
      memory_length=[0, 4],
      reuse_length=[0, 4],
      tie_attention_biases=[True, False],
      state=[True, False],
      mask=[True, False],
      segment=[True, False]))
  def test_transformer_xl(
      self,
      two_stream,
      memory_length,
      reuse_length,
      tie_attention_biases,
      state,
      mask,
      segment):
    """Tests combinations of full Transformer XL calculations."""
    batch_size, num_heads, head_size, seq_length = 2, 12, 64, 8
    hidden_size, num_predictions, inner_size = 24, 8, 12
    num_layers = 3

    data = create_mock_transformer_xl_data(
        include_biases=False,
        num_heads=num_heads,
        head_size=head_size,
        hidden_size=hidden_size,
        seq_length=seq_length,
        batch_size=batch_size,
        memory_length=memory_length,
        num_predictions=num_predictions,
        two_stream=two_stream,
        num_layers=num_layers,
        include_state=state,
        include_mask=mask,
        include_segment=segment)
    transformer_xl_layer = transformer_xl.TransformerXL(
        vocab_size=32000,
        num_layers=num_layers,
        head_size=head_size,
        hidden_size=hidden_size,
        num_attention_heads=num_heads,
        inner_size=inner_size,
        dropout_rate=0.,
        attention_dropout_rate=0.,
        initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
        two_stream=two_stream,
        tie_attention_biases=tie_attention_biases,
        memory_length=memory_length,
        reuse_length=reuse_length,
        inner_activation="relu")
    attention_output, cached_memory_states = transformer_xl_layer(**data)

    if two_stream:
      self.assertEqual(attention_output.shape,
                       [batch_size, num_predictions, hidden_size])
    else:
      self.assertEqual(attention_output.shape,
                       [batch_size, seq_length, hidden_size])
    self.assertEqual(len(cached_memory_states), num_layers)

  def test_get_config(self):
    transformer_xl_layer = transformer_xl.TransformerXL(
        vocab_size=32000,
        num_layers=12,
        hidden_size=36,
        head_size=12,
        num_attention_heads=12,
        inner_size=12,
        dropout_rate=0.,
        attention_dropout_rate=0.,
        initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
        two_stream=False,
        tie_attention_biases=True,
        memory_length=0,
        reuse_length=0,
        inner_activation="relu")
    transformer_xl_config = transformer_xl_layer.get_config()
    new_transformer_xl = transformer_xl.TransformerXL.from_config(
        transformer_xl_config)
    self.assertEqual(transformer_xl_config, new_transformer_xl.get_config())


if __name__ == "__main__":
  np.random.seed(0)
  tf.random.set_seed(0)
  tf.test.main()
@@ -72,7 +72,11 @@ class BertClassifier(tf.keras.Model):
     if use_encoder_pooler:
       # Because we have a copy of inputs to create this Model object, we can
       # invoke the Network object with its own input tensors to start the Model.
-      _, cls_output = network(inputs)
+      outputs = network(inputs)
+      if isinstance(outputs, list):
+        cls_output = outputs[1]
+      else:
+        cls_output = outputs['pooled_output']
       cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)

       self.classifier = networks.Classification(
@@ -83,7 +87,11 @@ class BertClassifier(tf.keras.Model):
           name='sentence_prediction')
       predictions = self.classifier(cls_output)
     else:
-      sequence_output, _ = network(inputs)
+      outputs = network(inputs)
+      if isinstance(outputs, list):
+        sequence_output = outputs[0]
+      else:
+        sequence_output = outputs['sequence_output']
       self.classifier = layers.ClassificationHead(
           inner_dim=sequence_output.shape[-1],
           num_classes=num_classes,
...
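
The hunk above makes `BertClassifier` tolerate encoders that return either the legacy `[sequence_output, pooled_output]` list or the newer output dictionary. A minimal sketch of that compatibility pattern, factored into a standalone helper (the helper name and its free-standing form are illustrative, not part of the commit):

    def _get_encoder_outputs(encoder, inputs):
      """Returns (sequence_output, pooled_output) for list- or dict-output encoders."""
      outputs = encoder(inputs)
      if isinstance(outputs, list):
        # Legacy convention: outputs[0] is sequence_output, outputs[1] is
        # pooled_output.
        return outputs[0], outputs[1]
      # Dict convention used by encoders built with dict_outputs=True.
      return outputs['sequence_output'], outputs['pooled_output']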
@@ -14,10 +14,6 @@
 # ==============================================================================
 """Tests for BERT trainer network."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 from absl.testing import parameterized
 import tensorflow as tf
@@ -31,13 +27,15 @@ from official.nlp.modeling.models import bert_classifier
 @keras_parameterized.run_all_keras_modes
 class BertClassifierTest(keras_parameterized.TestCase):

-  @parameterized.parameters(1, 3)
-  def test_bert_trainer(self, num_classes):
+  @parameterized.named_parameters(('single_cls', 1, False), ('3_cls', 3, False),
+                                  ('3_cls_dictoutputs', 3, True))
+  def test_bert_trainer(self, num_classes, dict_outputs):
     """Validate that the Keras object can be created."""
     # Build a transformer network to use within the BERT trainer.
     vocab_size = 100
     sequence_length = 512
-    test_network = networks.BertEncoder(vocab_size=vocab_size, num_layers=2)
+    test_network = networks.BertEncoder(
+        vocab_size=vocab_size, num_layers=2, dict_outputs=dict_outputs)

     # Create a BERT trainer with the created network.
     bert_trainer_model = bert_classifier.BertClassifier(
...
@@ -161,8 +161,9 @@ class BertPretrainerV2(tf.keras.Model):
     name: The name of the model.
   Inputs: Inputs defined by the encoder network, plus `masked_lm_positions` as a
     dictionary.
-  Outputs: A dictionary of `lm_output` and classification head outputs keyed by
-    head names.
+  Outputs: A dictionary of `lm_output`, classification head outputs keyed by
+    head names, and also outputs from `encoder_network`, keyed by
+    `pooled_output`, `sequence_output` and `encoder_outputs` (if any).
   """

   def __init__(
@@ -180,17 +181,32 @@ class BertPretrainerV2(tf.keras.Model):
         'classification_heads': classification_heads,
         'name': name,
     }

     self.encoder_network = encoder_network
     inputs = copy.copy(self.encoder_network.inputs)
-    sequence_output, _ = self.encoder_network(inputs)
+    outputs = dict()
+    encoder_network_outputs = self.encoder_network(inputs)
+    if isinstance(encoder_network_outputs, list):
+      outputs['pooled_output'] = encoder_network_outputs[1]
+      # When `encoder_network` was instantiated with return_all_encoder_outputs
+      # set to True, `encoder_network_outputs[0]` is a list containing
+      # all transformer layers' output.
+      if isinstance(encoder_network_outputs[0], list):
+        outputs['encoder_outputs'] = encoder_network_outputs[0]
+        outputs['sequence_output'] = encoder_network_outputs[0][-1]
+      else:
+        outputs['sequence_output'] = encoder_network_outputs[0]
+    elif isinstance(encoder_network_outputs, dict):
+      outputs = encoder_network_outputs
+    else:
+      raise ValueError('encoder_network\'s output should be either a list '
+                       'or a dict, but got %s' % encoder_network_outputs)
+    sequence_output = outputs['sequence_output']

     self.classification_heads = classification_heads or []
     if len(set([cls.name for cls in self.classification_heads])) != len(
         self.classification_heads):
       raise ValueError('Classification heads should have unique names.')

-    outputs = dict()
     self.masked_lm = layers.MaskedLM(
         embedding_table=self.encoder_network.get_embedding_table(),
         activation=mlm_activation,
@@ -199,7 +215,7 @@ class BertPretrainerV2(tf.keras.Model):
     masked_lm_positions = tf.keras.layers.Input(
         shape=(None,), name='masked_lm_positions', dtype=tf.int32)
     inputs.append(masked_lm_positions)
-    outputs['lm_output'] = self.masked_lm(
+    outputs['mlm_logits'] = self.masked_lm(
         sequence_output, masked_positions=masked_lm_positions)
     for cls_head in self.classification_heads:
       outputs[cls_head.name] = cls_head(sequence_output)
...
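
`BertPretrainerV2` now returns a dictionary instead of positional outputs, and the masked-LM logits move from the `lm_output` key to `mlm_logits`. A hedged sketch of consuming the new outputs (variable names are illustrative; the key names come from the hunk above):

    outputs = bert_pretrainer_model([word_ids, mask, type_ids, lm_positions])
    mlm_logits = outputs['mlm_logits']            # [batch, num_predictions, vocab_size]
    sequence_output = outputs['sequence_output']  # [batch, seq_length, hidden_size]
    pooled_output = outputs['pooled_output']      # [batch, hidden_size]
    # Present only when the encoder exposes per-layer outputs:
    encoder_outputs = outputs.get('encoder_outputs')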
@@ -12,12 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for BERT trainer network."""
+"""Tests for BERT pretrainer model."""
+import itertools
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from absl.testing import parameterized
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
@@ -111,15 +109,24 @@ class BertPretrainerTest(keras_parameterized.TestCase):
     self.assertAllEqual(bert_trainer_model.get_config(),
                         new_bert_trainer_model.get_config())

-  def test_bert_pretrainerv2(self):
+  @parameterized.parameters(itertools.product(
+      (False, True),
+      (False, True),
+  ))
+  def test_bert_pretrainerv2(self, dict_outputs, return_all_encoder_outputs):
     """Validate that the Keras object can be created."""
     # Build a transformer network to use within the BERT trainer.
     vocab_size = 100
     sequence_length = 512
+    hidden_size = 48
+    num_layers = 2
     test_network = networks.BertEncoder(
         vocab_size=vocab_size,
-        num_layers=2,
-        max_sequence_length=sequence_length)
+        num_layers=num_layers,
+        hidden_size=hidden_size,
+        max_sequence_length=sequence_length,
+        return_all_encoder_outputs=return_all_encoder_outputs,
+        dict_outputs=dict_outputs)

     # Create a BERT trainer with the created network.
     bert_trainer_model = bert_pretrainer.BertPretrainerV2(
@@ -134,9 +141,28 @@ class BertPretrainerTest(keras_parameterized.TestCase):
     # Invoke the trainer model on the inputs. This causes the layer to be built.
     outputs = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
+    has_encoder_outputs = dict_outputs or return_all_encoder_outputs
+    if has_encoder_outputs:
+      self.assertSameElements(
+          outputs.keys(),
+          ['sequence_output', 'pooled_output', 'mlm_logits', 'encoder_outputs'])
+      self.assertLen(outputs['encoder_outputs'], num_layers)
+    else:
+      self.assertSameElements(
+          outputs.keys(), ['sequence_output', 'pooled_output', 'mlm_logits'])

     # Validate that the outputs are of the expected shape.
     expected_lm_shape = [None, num_token_predictions, vocab_size]
-    self.assertAllEqual(expected_lm_shape, outputs['lm_output'].shape.as_list())
+    self.assertAllEqual(expected_lm_shape,
+                        outputs['mlm_logits'].shape.as_list())
+
+    expected_sequence_output_shape = [None, sequence_length, hidden_size]
+    self.assertAllEqual(expected_sequence_output_shape,
+                        outputs['sequence_output'].shape.as_list())
+    expected_pooled_output_shape = [None, hidden_size]
+    self.assertAllEqual(expected_pooled_output_shape,
+                        outputs['pooled_output'].shape.as_list())

   def test_v2_serialize_deserialize(self):
     """Validate that the BERT trainer can be serialized and deserialized."""
...
@@ -64,7 +64,11 @@ class BertSpanLabeler(tf.keras.Model):
     # Because we have a copy of inputs to create this Model object, we can
     # invoke the Network object with its own input tensors to start the Model.
-    sequence_output, _ = network(inputs)
+    outputs = network(inputs)
+    if isinstance(outputs, list):
+      sequence_output = outputs[0]
+    else:
+      sequence_output = outputs['sequence_output']

     # This is an instance variable for ease of access to the underlying task
     # network.
...
@@ -14,10 +14,7 @@
 # ==============================================================================
 """Tests for BERT trainer network."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from absl.testing import parameterized
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
@@ -30,12 +27,14 @@ from official.nlp.modeling.models import bert_span_labeler
 @keras_parameterized.run_all_keras_modes
 class BertSpanLabelerTest(keras_parameterized.TestCase):

-  def test_bert_trainer(self):
+  @parameterized.parameters(True, False)
+  def test_bert_trainer(self, dict_outputs):
     """Validate that the Keras object can be created."""
     # Build a transformer network to use within the BERT trainer.
     vocab_size = 100
     sequence_length = 512
-    test_network = networks.BertEncoder(vocab_size=vocab_size, num_layers=2)
+    test_network = networks.BertEncoder(
+        vocab_size=vocab_size, num_layers=2, dict_outputs=dict_outputs)

     # Create a BERT trainer with the created network.
     bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network)
...
@@ -67,7 +67,11 @@ class BertTokenClassifier(tf.keras.Model):
     # Because we have a copy of inputs to create this Model object, we can
     # invoke the Network object with its own input tensors to start the Model.
-    sequence_output, _ = network(inputs)
+    outputs = network(inputs)
+    if isinstance(outputs, list):
+      sequence_output = outputs[0]
+    else:
+      sequence_output = outputs['sequence_output']
     sequence_output = tf.keras.layers.Dropout(rate=dropout_rate)(
         sequence_output)
...
@@ -12,12 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for BERT trainer network."""
+"""Tests for BERT token classifier."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+from absl.testing import parameterized
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
@@ -30,7 +27,8 @@ from official.nlp.modeling.models import bert_token_classifier
 @keras_parameterized.run_all_keras_modes
 class BertTokenClassifierTest(keras_parameterized.TestCase):

-  def test_bert_trainer(self):
+  @parameterized.parameters(True, False)
+  def test_bert_trainer(self, dict_outputs):
     """Validate that the Keras object can be created."""
     # Build a transformer network to use within the BERT trainer.
     vocab_size = 100
@@ -38,7 +36,8 @@ class BertTokenClassifierTest(keras_parameterized.TestCase):
     test_network = networks.BertEncoder(
         vocab_size=vocab_size,
         num_layers=2,
-        max_sequence_length=sequence_length)
+        max_sequence_length=sequence_length,
+        dict_outputs=dict_outputs)

     # Create a BERT trainer with the created network.
     num_classes = 3
...
@@ -14,12 +14,7 @@
 # ==============================================================================
 """Trainer network for dual encoder style models."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function
-# Import libraries
 import tensorflow as tf

 from official.nlp.modeling import layers
@@ -84,11 +79,16 @@ class DualEncoder(tf.keras.Model):
         shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

     left_inputs = [left_word_ids, left_mask, left_type_ids]
-    left_sequence_output, left_encoded = network(left_inputs)
+    left_outputs = network(left_inputs)
+    if isinstance(left_outputs, list):
+      left_sequence_output, left_encoded = left_outputs
+    else:
+      left_sequence_output = left_outputs['sequence_output']
+      left_encoded = left_outputs['pooled_output']
     if normalize:
       left_encoded = tf.keras.layers.Lambda(
-          lambda x: tf.nn.l2_normalize(x, axis=1))(left_encoded)
+          lambda x: tf.nn.l2_normalize(x, axis=1))(
+              left_encoded)

     if output == 'logits':
       right_word_ids = tf.keras.layers.Input(
@@ -99,33 +99,40 @@ class DualEncoder(tf.keras.Model):
         shape=(max_seq_length,), dtype=tf.int32, name='right_type_ids')

       right_inputs = [right_word_ids, right_mask, right_type_ids]
-      _, right_encoded = network(right_inputs)
+      right_outputs = network(right_inputs)
+      if isinstance(right_outputs, list):
+        _, right_encoded = right_outputs
+      else:
+        right_encoded = right_outputs['pooled_output']
       if normalize:
         right_encoded = tf.keras.layers.Lambda(
-            lambda x: tf.nn.l2_normalize(x, axis=1))(right_encoded)
+            lambda x: tf.nn.l2_normalize(x, axis=1))(
+                right_encoded)

-      dot_products = layers.MatMulWithMargin(logit_scale=logit_scale,
-                                             logit_margin=logit_margin,
-                                             name='dot_product')
+      dot_products = layers.MatMulWithMargin(
+          logit_scale=logit_scale,
+          logit_margin=logit_margin,
+          name='dot_product')

-      inputs = [left_word_ids, left_mask, left_type_ids, right_word_ids,
-                right_mask, right_type_ids]
+      inputs = [
+          left_word_ids, left_mask, left_type_ids, right_word_ids, right_mask,
+          right_type_ids
+      ]
       left_logits, right_logits = dot_products(left_encoded, right_encoded)

-      outputs = [left_logits, right_logits]
+      outputs = dict(left_logits=left_logits, right_logits=right_logits)
     elif output == 'predictions':
       inputs = [left_word_ids, left_mask, left_type_ids]

       # To keep consistent with legacy BERT hub modules, the outputs are
       # "pooled_output" and "sequence_output".
-      outputs = [left_encoded, left_sequence_output]
+      outputs = dict(
+          sequence_output=left_sequence_output, pooled_output=left_encoded)
     else:
       raise ValueError('output type %s is not supported' % output)

-    super(DualEncoder, self).__init__(
-        inputs=inputs, outputs=outputs, **kwargs)
+    super(DualEncoder, self).__init__(inputs=inputs, outputs=outputs, **kwargs)

     # Set _self_setattr_tracking to True so it can be exported with assets.
     self._self_setattr_tracking = True
...
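
`DualEncoder` likewise switches from list outputs to named outputs. A minimal sketch of consuming the two modes (a given model instance supports only the mode it was built with; key names follow the hunk above):

    # For a model built with output='logits':
    logits = dual_encoder_model([left_word_ids, left_mask, left_type_ids,
                                 right_word_ids, right_mask, right_type_ids])
    left_logits, right_logits = logits['left_logits'], logits['right_logits']

    # For a model built with output='predictions':
    preds = dual_encoder_model([left_word_ids, left_mask, left_type_ids])
    pooled, sequence = preds['pooled_output'], preds['sequence_output']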
@@ -14,11 +14,6 @@
 # ==============================================================================
 """Tests for dual encoder network."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Import libraries
 from absl.testing import parameterized
 import tensorflow as tf
@@ -42,7 +37,8 @@ class DualEncoderTest(keras_parameterized.TestCase):
         vocab_size=vocab_size,
         num_layers=2,
         hidden_size=hidden_size,
-        sequence_length=sequence_length)
+        sequence_length=sequence_length,
+        dict_outputs=True)

     # Create a dual encoder model with the created network.
     dual_encoder_model = dual_encoder.DualEncoder(
@@ -59,21 +55,19 @@ class DualEncoderTest(keras_parameterized.TestCase):
     if output == 'logits':
       outputs = dual_encoder_model([
-          left_word_ids, left_mask, left_type_ids,
-          right_word_ids, right_mask, right_type_ids])
+          left_word_ids, left_mask, left_type_ids, right_word_ids, right_mask,
+          right_type_ids
+      ])
-      left_encoded, _ = outputs
+      _ = outputs['left_logits']
     elif output == 'predictions':
-      left_encoded, left_sequence_output = dual_encoder_model([
-          left_word_ids, left_mask, left_type_ids])
+      outputs = dual_encoder_model([left_word_ids, left_mask, left_type_ids])

     # Validate that the outputs are of the expected shape.
-    expected_encoding_shape = [None, 768]
-    self.assertAllEqual(expected_encoding_shape, left_encoded.shape.as_list())
     expected_sequence_shape = [None, sequence_length, 768]
     self.assertAllEqual(expected_sequence_shape,
-                        left_sequence_output.shape.as_list())
+                        outputs['sequence_output'].shape.as_list())
+    left_encoded = outputs['pooled_output']
+    expected_encoding_shape = [None, 768]
+    self.assertAllEqual(expected_encoding_shape, left_encoded.shape.as_list())

   @parameterized.parameters((192, 'logits'), (768, 'predictions'))
   def test_dual_encoder_tensor_call(self, hidden_size, output):
...
@@ -14,10 +14,6 @@
 # ==============================================================================
 """Trainer network for ELECTRA models."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function

 import copy
@@ -139,14 +135,11 @@ class ElectraPretrainer(tf.keras.Model):
     masked_lm_positions = inputs['masked_lm_positions']

     ### Generator ###
-    sequence_output, cls_output = self.generator_network(
-        [input_word_ids, input_mask, input_type_ids])
+    sequence_output = self.generator_network(
+        [input_word_ids, input_mask, input_type_ids])['sequence_output']

     # The generator encoder network may get outputs from all layers.
     if isinstance(sequence_output, list):
       sequence_output = sequence_output[-1]
-    if isinstance(cls_output, list):
-      cls_output = cls_output[-1]

     lm_outputs = self.masked_lm(sequence_output, masked_lm_positions)
     sentence_outputs = self.classification(sequence_output)
@@ -157,10 +150,10 @@ class ElectraPretrainer(tf.keras.Model):
     ### Discriminator ###
     disc_input = fake_data['inputs']
     disc_label = fake_data['is_fake_tokens']
-    disc_sequence_output, _ = self.discriminator_network([
-        disc_input['input_word_ids'], disc_input['input_mask'],
-        disc_input['input_type_ids']
-    ])
+    disc_sequence_output = self.discriminator_network([
+        disc_input['input_word_ids'], disc_input['input_mask'],
+        disc_input['input_type_ids']
+    ])['sequence_output']

     # The discriminator encoder network may get outputs from all layers.
     if isinstance(disc_sequence_output, list):
...
@@ -14,10 +14,6 @@
 # ==============================================================================
 """Tests for ELECTRA pre trainer network."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
@@ -38,11 +34,13 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
     test_generator_network = networks.BertEncoder(
         vocab_size=vocab_size,
         num_layers=2,
-        max_sequence_length=sequence_length)
+        max_sequence_length=sequence_length,
+        dict_outputs=True)
     test_discriminator_network = networks.BertEncoder(
         vocab_size=vocab_size,
         num_layers=2,
-        max_sequence_length=sequence_length)
+        max_sequence_length=sequence_length,
+        dict_outputs=True)

     # Create a ELECTRA trainer with the created network.
     num_classes = 3
@@ -92,9 +90,9 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
     # Build a transformer network to use within the ELECTRA trainer. (Here, we
     # use a short sequence_length for convenience.)
     test_generator_network = networks.BertEncoder(
-        vocab_size=100, num_layers=4, max_sequence_length=3)
+        vocab_size=100, num_layers=4, max_sequence_length=3, dict_outputs=True)
     test_discriminator_network = networks.BertEncoder(
-        vocab_size=100, num_layers=4, max_sequence_length=3)
+        vocab_size=100, num_layers=4, max_sequence_length=3, dict_outputs=True)

     # Create a ELECTRA trainer with the created network.
     eletrca_trainer_model = electra_pretrainer.ElectraPretrainer(
...
@@ -142,12 +142,12 @@ class Seq2SeqTransformer(tf.keras.Model):
     self._beam_size = beam_size
     self._alpha = alpha
     self._dtype = dtype
-    self.embedding_lookup = layers.OnDeviceEmbedding(
+    self.embedding_lookup = keras_nlp.layers.OnDeviceEmbedding(
         vocab_size=self._vocab_size,
         embedding_width=self._embedding_width,
         initializer=tf.random_normal_initializer(
             mean=0., stddev=self._embedding_width**-0.5),
-        use_scale=True)
+        scale_factor=self._embedding_width**0.5)
     self.encoder_layer = encoder_layer
     self.decoder_layer = decoder_layer
     self.position_embedding = layers.RelativePositionEmbedding(
@@ -472,7 +472,7 @@ class TransformerEncoder(tf.keras.layers.Layer):
     self.encoder_layers = []
     for i in range(self.num_layers):
       self.encoder_layers.append(
-          keras_nlp.TransformerEncoderBlock(
+          keras_nlp.layers.TransformerEncoderBlock(
              num_attention_heads=self.num_attention_heads,
              inner_dim=self._intermediate_size,
             inner_activation=self._activation,
@@ -581,7 +581,7 @@ class TransformerDecoder(tf.keras.layers.Layer):
     self.decoder_layers = []
     for i in range(self.num_layers):
       self.decoder_layers.append(
-          layers.TransformerDecoderLayer(
+          layers.TransformerDecoderBlock(
              num_attention_heads=self.num_attention_heads,
              intermediate_size=self._intermediate_size,
             intermediate_activation=self._activation,
...
@@ -10,7 +10,7 @@ Transformer-based encoder as described in ["BERT: Pre-training of Deep
 Bidirectional Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding lookups,
 transformer layers and pooling layer.

-* [`AlbertTransformerEncoder`](albert_transformer_encoder.py) implements a
+* [`AlbertEncoder`](albert_encoder.py) implements a
 Transformer-encoder described in the paper ["ALBERT: A Lite BERT for
 Self-supervised Learning of Language Representations"]
 (https://arxiv.org/abs/1909.11942). Compared with [BERT](https://arxiv.org/abs/1810.04805), ALBERT refactorizes embedding parameters
@@ -26,3 +26,4 @@ to 1) head.
 * [`SpanLabeling`](span_labeling.py) implements a single-span labeler (that is, a prediction head that can predict one start and end index per batch item) based on a single dense hidden layer. It can be used in the SQuAD task.
+* [`XLNetBase`](xlnet_base.py) implements the base network used in ["XLNet: Generalized Autoregressive Pretraining for Language Understanding"](https://arxiv.org/abs/1906.08237). It includes embedding lookups, relative position encodings, mask computations, segment matrix computations and Transformer XL layers using one- or two-stream relative self-attention.
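
For orientation, a minimal sketch of driving one of the networks listed above through the `networks` package (whose exports are updated in the next hunk); argument values are arbitrary, and `hidden_size` defaults to 768:

    import numpy as np
    from official.nlp.modeling import networks

    encoder = networks.BertEncoder(vocab_size=100, num_layers=2,
                                   dict_outputs=True)
    word_ids = np.random.randint(100, size=(2, 16)).astype(np.int32)
    mask = np.ones((2, 16), dtype=np.int32)
    type_ids = np.zeros((2, 16), dtype=np.int32)
    outputs = encoder([word_ids, mask, type_ids])
    print(outputs['sequence_output'].shape)  # (2, 16, 768)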
@@ -13,11 +13,12 @@
 # limitations under the License.
 # ==============================================================================
 """Networks package definition."""
-from official.nlp.modeling.networks.albert_transformer_encoder import AlbertTransformerEncoder
+from official.nlp.modeling.networks.albert_encoder import AlbertEncoder
 from official.nlp.modeling.networks.bert_encoder import BertEncoder
 from official.nlp.modeling.networks.classification import Classification
 from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
 from official.nlp.modeling.networks.mobile_bert_encoder import MobileBERTEncoder
 from official.nlp.modeling.networks.span_labeling import SpanLabeling
+from official.nlp.modeling.networks.xlnet_base import XLNetBase
 # Backward compatibility. The modules are deprecated.
 TransformerEncoder = BertEncoder
@@ -14,10 +14,6 @@
 # ==============================================================================
 """ALBERT (https://arxiv.org/abs/1810.04805) text encoder network."""
 # pylint: disable=g-classes-have-attributes
-from __future__ import absolute_import
-from __future__ import division
-# from __future__ import google_type_annotations
-from __future__ import print_function

 import tensorflow as tf
@@ -27,7 +23,7 @@ from official.nlp.modeling import layers
 @tf.keras.utils.register_keras_serializable(package='Text')
-class AlbertTransformerEncoder(tf.keras.Model):
+class AlbertEncoder(tf.keras.Model):
   """ALBERT (https://arxiv.org/abs/1810.04805) text encoder network.

   This network implements the encoder described in the paper "ALBERT: A Lite
@@ -64,6 +60,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
     attention_dropout_rate: The dropout rate to use for the attention layers
       within the transformer layers.
     initializer: The initialzer to use for all weights in this encoder.
+    dict_outputs: Whether to use a dictionary as the model outputs.
   """

   def __init__(self,
@@ -79,6 +76,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
                dropout_rate=0.1,
                attention_dropout_rate=0.1,
                initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
+               dict_outputs=False,
                **kwargs):
     activation = tf.keras.activations.get(activation)
     initializer = tf.keras.initializers.get(initializer)
@@ -116,7 +114,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
     word_embeddings = self._embedding_layer(word_ids)
     # Always uses dynamic slicing for simplicity.
-    self._position_embedding_layer = keras_nlp.PositionEmbedding(
+    self._position_embedding_layer = keras_nlp.layers.PositionEmbedding(
         initializer=initializer,
         max_length=max_sequence_length,
         name='position_embedding')
@@ -152,7 +150,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
     data = embeddings
     attention_mask = layers.SelfAttentionMask()([data, mask])

-    shared_layer = keras_nlp.TransformerEncoderBlock(
+    shared_layer = keras_nlp.layers.TransformerEncoderBlock(
         num_attention_heads=num_attention_heads,
         inner_dim=intermediate_size,
         inner_activation=activation,
@@ -160,8 +158,10 @@ class AlbertTransformerEncoder(tf.keras.Model):
         attention_dropout=attention_dropout_rate,
         kernel_initializer=initializer,
         name='transformer')
+    encoder_outputs = []
     for _ in range(num_layers):
       data = shared_layer([data, attention_mask])
+      encoder_outputs.append(data)

     first_token_tensor = (
         tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data)
@@ -172,9 +172,17 @@ class AlbertTransformerEncoder(tf.keras.Model):
         kernel_initializer=initializer,
         name='pooler_transform')(
             first_token_tensor)
-
-    super(AlbertTransformerEncoder, self).__init__(
-        inputs=[word_ids, mask, type_ids], outputs=[data, cls_output], **kwargs)
+    if dict_outputs:
+      outputs = dict(
+          sequence_output=data,
+          encoder_outputs=encoder_outputs,
+          pooled_output=cls_output,
+      )
+    else:
+      outputs = [data, cls_output]
+
+    super(AlbertEncoder, self).__init__(
+        inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)

   def get_embedding_table(self):
     return self._embedding_layer.embeddings
...
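
A hedged usage sketch of the new `dict_outputs` mode added above, with `word_ids`, `mask`, and `type_ids` standing in for integer tensors of shape `[batch, seq_length]` (the constructor arguments mirror the test constants and are assumptions here):

    encoder = albert_encoder.AlbertEncoder(
        vocab_size=57, embedding_width=8, hidden_size=32,
        num_attention_heads=2, num_layers=3, dict_outputs=True)
    outputs = encoder([word_ids, mask, type_ids])
    outputs['sequence_output']  # final shared-layer output, [batch, seq, 32]
    outputs['encoder_outputs']  # one tensor per pass through the shared layer
    outputs['pooled_output']    # dense-pooled [CLS] vector, [batch, 32]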
@@ -23,16 +23,16 @@ import numpy as np
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
-from official.nlp.modeling.networks import albert_transformer_encoder
+from official.nlp.modeling.networks import albert_encoder

 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
 # guarantees forward compatibility of this code for the V2 switchover.
 @keras_parameterized.run_all_keras_modes
-class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
+class AlbertEncoderTest(keras_parameterized.TestCase):

   def tearDown(self):
-    super(AlbertTransformerEncoderTest, self).tearDown()
+    super(AlbertEncoderTest, self).tearDown()
     tf.keras.mixed_precision.experimental.set_policy("float32")

   @parameterized.named_parameters(
@@ -52,7 +52,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
     # Create a small TransformerEncoder for testing.
-    test_network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
+    test_network = albert_encoder.AlbertEncoder(**kwargs)

     # Create the inputs (note that the first dimension is implicit).
     word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -84,13 +84,14 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     sequence_length = 21
     vocab_size = 57
     num_types = 7
+    num_layers = 3
     # Create a small TransformerEncoder for testing.
-    test_network = albert_transformer_encoder.AlbertTransformerEncoder(
+    test_network = albert_encoder.AlbertEncoder(
         vocab_size=vocab_size,
         embedding_width=8,
         hidden_size=hidden_size,
         num_attention_heads=2,
-        num_layers=3,
+        num_layers=num_layers,
         type_vocab_size=num_types)
     # Create the inputs (note that the first dimension is implicit).
     word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -109,21 +110,43 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     mask_data = np.random.randint(2, size=(batch_size, sequence_length))
     type_id_data = np.random.randint(
         num_types, size=(batch_size, sequence_length))
-    _ = model.predict([word_id_data, mask_data, type_id_data])
+    list_outputs = model.predict([word_id_data, mask_data, type_id_data])

     # Creates a TransformerEncoder with max_sequence_length != sequence_length
     max_sequence_length = 128
-    test_network = albert_transformer_encoder.AlbertTransformerEncoder(
+    test_network = albert_encoder.AlbertEncoder(
         vocab_size=vocab_size,
         embedding_width=8,
         hidden_size=hidden_size,
         max_sequence_length=max_sequence_length,
         num_attention_heads=2,
-        num_layers=3,
+        num_layers=num_layers,
         type_vocab_size=num_types)
     model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
     _ = model.predict([word_id_data, mask_data, type_id_data])

+    # Tests dictionary outputs.
+    test_network_dict = albert_encoder.AlbertEncoder(
+        vocab_size=vocab_size,
+        embedding_width=8,
+        hidden_size=hidden_size,
+        max_sequence_length=max_sequence_length,
+        num_attention_heads=2,
+        num_layers=num_layers,
+        type_vocab_size=num_types,
+        dict_outputs=True)
+    _ = test_network_dict([word_ids, mask, type_ids])
+    test_network_dict.set_weights(test_network.get_weights())
+    list_outputs = test_network([word_id_data, mask_data, type_id_data])
+    dict_outputs = test_network_dict(
+        dict(
+            input_word_ids=word_id_data,
+            input_mask=mask_data,
+            input_type_ids=type_id_data))
+    self.assertAllEqual(list_outputs[0], dict_outputs["sequence_output"])
+    self.assertAllEqual(list_outputs[1], dict_outputs["pooled_output"])
+    self.assertLen(dict_outputs["encoder_outputs"], num_layers)

   def test_serialize_deserialize(self):
     tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
     # Create a network object that sets all of its config options.
@@ -140,7 +163,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
         dropout_rate=0.05,
         attention_dropout_rate=0.22,
         initializer="glorot_uniform")
-    network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
+    network = albert_encoder.AlbertEncoder(**kwargs)

     expected_config = dict(kwargs)
     expected_config["activation"] = tf.keras.activations.serialize(
@@ -151,7 +174,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
     # Create another network object from the first object's config.
     new_network = (
-        albert_transformer_encoder.AlbertTransformerEncoder.from_config(
+        albert_encoder.AlbertEncoder.from_config(
             network.get_config()))

     # Validate that the config can be forced to JSON.
...
@@ -19,11 +19,13 @@ import tensorflow as tf
from official.modeling import activations
from official.nlp import keras_nlp


# This class is being replaced by keras_nlp.encoders.BertEncoder and merely
# acts as a wrapper if you need: 1) list outputs instead of dict outputs,
# 2) shared embedding layer.
@tf.keras.utils.register_keras_serializable(package='Text')
class BertEncoder(keras_nlp.encoders.BertEncoder):
  """Bi-directional Transformer-based encoder network.

  This network implements a bi-directional Transformer-based encoder as
@@ -58,7 +60,9 @@ class BertEncoder(tf.keras.Model):
      within the transformer layers.
    initializer: The initializer to use for all weights in this encoder.
    return_all_encoder_outputs: Whether to output sequence embedding outputs of
      all encoder transformer layers. Note: when the following `dict_outputs`
      argument is True, all encoder outputs are always returned in the dict,
      keyed by `encoder_outputs`.
    output_range: The sequence output range, [0, output_range), by slicing the
      target sequence of the last transformer layer. `None` means the entire
      target sequence will attend to the source sequence, which yields the full
@@ -72,6 +76,7 @@ class BertEncoder(tf.keras.Model):
      embedding layer. Otherwise, we will reuse the given embedding layer. This
      parameter was originally added for the ELECTRA model, which needs to tie
      the generator embeddings with the discriminator embeddings.
    dict_outputs: Whether to use a dictionary as the model outputs.
  """

  def __init__(self,
@@ -91,140 +96,53 @@ class BertEncoder(tf.keras.Model):
               output_range=None,
               embedding_width=None,
               embedding_layer=None,
               dict_outputs=False,
               **kwargs):
    self._self_setattr_tracking = False
    self._embedding_layer_instance = embedding_layer
    super(BertEncoder, self).__init__(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        num_attention_heads=num_attention_heads,
        max_sequence_length=max_sequence_length,
        type_vocab_size=type_vocab_size,
        inner_dim=intermediate_size,
        inner_activation=activation,
        output_dropout=dropout_rate,
        attention_dropout=attention_dropout_rate,
        initializer=initializer,
        output_range=output_range,
        embedding_width=embedding_width)
    # Replace arguments from keras_nlp.encoders.BertEncoder with the legacy
    # names, so that get_config() keeps this class's historical keys.
    self._config_dict['activation'] = self._config_dict.pop('inner_activation')
    self._config_dict['intermediate_size'] = self._config_dict.pop('inner_dim')
    self._config_dict['dropout_rate'] = self._config_dict.pop('output_dropout')
    self._config_dict['attention_dropout_rate'] = self._config_dict.pop(
        'attention_dropout')
    self._config_dict['dict_outputs'] = dict_outputs
    self._config_dict['return_all_encoder_outputs'] = return_all_encoder_outputs
    if dict_outputs:
      # The parent class already produces dict outputs; nothing to repack.
      return
    else:
      # Repack the parent's dict outputs into the legacy list outputs.
      nested_output = self._nested_outputs
      cls_output = nested_output['pooled_output']
      if return_all_encoder_outputs:
        encoder_outputs = nested_output['encoder_outputs']
        outputs = [encoder_outputs, cls_output]
      else:
        sequence_output = nested_output['sequence_output']
        outputs = [sequence_output, cls_output]
      # Skip keras_nlp.encoders.BertEncoder.__init__ and re-initialize the
      # functional model with the list outputs.
      super(keras_nlp.encoders.BertEncoder, self).__init__(
          inputs=self.inputs, outputs=outputs, **kwargs)

  # Override method for the shared embedding use case.
  def _build_embedding_layer(self):
    if self._embedding_layer_instance is None:
      return super(BertEncoder, self)._build_embedding_layer()
    else:
      return self._embedding_layer_instance
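A minimal usage sketch of the wrapper defined above (sizes are illustrative; with `dict_outputs=False` the model keeps the legacy list outputs, with `dict_outputs=True` it returns the keras_nlp-style dict):

import numpy as np

# Sketch only: tiny illustrative sizes.
encoder = BertEncoder(
    vocab_size=100, hidden_size=32, num_layers=2, num_attention_heads=2)
word_ids = np.random.randint(100, size=(2, 16))
mask = np.ones((2, 16), dtype=np.int32)
type_ids = np.zeros((2, 16), dtype=np.int32)
# Legacy list outputs: [sequence_output, pooled_output].
sequence_output, pooled_output = encoder([word_ids, mask, type_ids])

dict_encoder = BertEncoder(
    vocab_size=100, hidden_size=32, num_layers=2, num_attention_heads=2,
    dict_outputs=True)
# Dict outputs are keyed 'sequence_output', 'pooled_output' (and
# 'encoder_outputs' for the per-layer activations).
outputs = dict_encoder(
    dict(input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids))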
@@ -12,11 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for transformer-based bert encoder network."""

# Import libraries
from absl.testing import parameterized
@@ -64,6 +60,35 @@ class BertEncoderTest(keras_parameterized.TestCase):
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)
    test_network_dict = bert_encoder.BertEncoder(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        dict_outputs=True)
    # Create the inputs (note that the first dimension is implicit).
    inputs = dict(
        input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids)
    _ = test_network_dict(inputs)
    test_network_dict.set_weights(test_network.get_weights())

    batch_size = 2
    vocab_size = 100
    num_types = 2
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    list_outputs = test_network([word_id_data, mask_data, type_id_data])
    dict_outputs = test_network_dict(
        dict(
            input_word_ids=word_id_data,
            input_mask=mask_data,
            input_type_ids=type_id_data))
    self.assertAllEqual(list_outputs[0], dict_outputs["sequence_output"])
    self.assertAllEqual(list_outputs[1], dict_outputs["pooled_output"])
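    # (Note on the block above: set_weights copies the list-output network's
    # weights into the dict-output network, so the two assertions verify that
    # the list and dict output formats expose the same underlying computation.)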

  def test_all_encoder_outputs_network_creation(self):
    hidden_size = 32
    sequence_length = 21
@@ -199,7 +224,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
        initializer="glorot_uniform",
        return_all_encoder_outputs=False,
        output_range=-1,
        embedding_width=16,
        dict_outputs=True)
    network = bert_encoder.BertEncoder(**kwargs)
    expected_config = dict(kwargs)
    expected_config["activation"] = tf.keras.activations.serialize(
...
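Finally, the shared-embedding path enabled by the `_build_embedding_layer` override can be sketched as follows. This is a hypothetical pattern only: it assumes the inherited `get_embedding_layer()` accessor is available on the wrapper, and the sizes are illustrative.

# Hypothetical sketch of tying embeddings, e.g. for ELECTRA-style models.
generator = BertEncoder(
    vocab_size=100, hidden_size=32, num_layers=2, num_attention_heads=2)
# Passing embedding_layer reuses the generator's embedding layer instead of
# building a new table, so both networks share one set of embedding weights.
discriminator = BertEncoder(
    vocab_size=100, hidden_size=32, num_layers=2, num_attention_heads=2,
    embedding_layer=generator.get_embedding_layer())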