Commit b0ccdb11 authored by Shixin Luo

resolve conflict with master

parents e61588cd 1611a8c5
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Transformer XL."""
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import transformer_xl
def create_mock_transformer_xl_data(
batch_size,
num_heads,
head_size,
hidden_size,
seq_length,
memory_length=0,
num_predictions=2,
two_stream=False,
num_layers=1,
include_biases=True,
include_state=False,
include_mask=False,
include_segment=False):
"""Creates mock testing data.
Args:
batch_size: `int`, the batch size.
num_heads: `int`, number of attention heads.
head_size: `int`, the size of each attention head.
hidden_size: `int`, the layer's hidden size.
seq_length: `int`, Sequence length of the input.
memory_length: optional `int`, the length of the state. Defaults to 0.
num_predictions: `int`, the number of predictions used in two stream
attention.
two_stream: `bool`, whether or not to generate two stream data.
num_layers: `int`, the number of Transformer XL blocks.
include_biases: optional `bool`, whether or not to include attention biases.
include_state: optional `bool`, whether or not to include state data.
include_mask: optional `bool`, whether or not to include mask data.
include_segment: optional `bool`, whether or not to include segment data.
Returns:
A dictionary with `str` as keys and `Tensor` as values.
"""
encoding_shape = (batch_size, seq_length * 2, hidden_size)
data = dict(
relative_position_encoding=tf.random.normal(shape=encoding_shape),
content_stream=tf.random.normal(
shape=(batch_size, seq_length, hidden_size)))
if include_biases:
attention_bias_shape = (num_heads, head_size)
data.update(dict(
content_attention_bias=tf.random.normal(shape=attention_bias_shape),
segment_attention_bias=tf.random.normal(shape=attention_bias_shape),
positional_attention_bias=tf.random.normal(shape=attention_bias_shape)))
if two_stream:
data.update(dict(
query_stream=tf.random.normal(
shape=(batch_size, num_predictions, hidden_size)),
target_mapping=tf.random.normal(
shape=(batch_size, num_predictions, seq_length))))
if include_state:
total_seq_length = seq_length + memory_length
if num_layers > 1:
state_shape = (num_layers, batch_size, memory_length, hidden_size)
else:
state_shape = (batch_size, memory_length, hidden_size)
data.update(dict(
state=tf.random.normal(shape=state_shape)))
else:
total_seq_length = seq_length
if include_mask:
mask_shape = (batch_size, num_heads, seq_length, total_seq_length)
mask_data = np.random.randint(2, size=mask_shape).astype("float32")
data["content_attention_mask"] = mask_data
if two_stream:
data["query_attention_mask"] = mask_data
if include_segment:
# A transformer XL block takes an individual segment "encoding" from the
# entirety of the Transformer XL segment "embedding".
if num_layers > 1:
segment_encoding_shape = (num_layers, 2, num_heads, head_size)
segment_encoding_name = "segment_embedding"
else:
segment_encoding_shape = (2, num_heads, head_size)
segment_encoding_name = "segment_encoding"
segment_matrix = np.random.randint(
2, size=(batch_size, seq_length, total_seq_length))
data["segment_matrix"] = tf.math.equal(segment_matrix, 1)
data[segment_encoding_name] = tf.random.normal(shape=segment_encoding_shape)
return data
@keras_parameterized.run_all_keras_modes
class TransformerXLBlockTest(keras_parameterized.TestCase):
@combinations.generate(combinations.combine(
memory_length=[0, 4],
two_stream=[True, False],
state=[True, False],
mask=[True, False],
segment=[True, False]))
def test_transformer_xl_block(
self,
two_stream,
memory_length,
state,
mask,
segment):
"""Tests combinations of Transformer XL block calculations."""
batch_size, num_heads, head_size, seq_length = 2, 12, 64, 8
hidden_size, num_predictions, inner_size = 24, 8, 12
data = create_mock_transformer_xl_data(
include_biases=True,
num_heads=num_heads,
head_size=head_size,
hidden_size=hidden_size,
seq_length=seq_length,
batch_size=batch_size,
memory_length=memory_length,
num_predictions=num_predictions,
two_stream=two_stream,
include_state=state,
include_mask=mask,
include_segment=segment)
test_layer = transformer_xl.TransformerXLBlock(
vocab_size=32000,
hidden_size=hidden_size,
num_attention_heads=num_heads,
head_size=head_size,
inner_size=inner_size,
dropout_rate=0.,
attention_dropout_rate=0.,
two_stream=two_stream)
output = test_layer(**data)
content_attention = output["content_attention"]
self.assertEqual(content_attention.shape,
[batch_size, seq_length, hidden_size])
if two_stream:
self.assertIn("query_attention", output)
self.assertEqual(output["query_attention"].shape,
[batch_size, num_predictions, hidden_size])
else:
self.assertNotIn("query_attention", output)
def test_get_config(self):
transformer_xl_block = transformer_xl.TransformerXLBlock(
vocab_size=32000,
head_size=64,
num_attention_heads=2,
hidden_size=10,
inner_size=50,
dropout_rate=0.,
attention_dropout_rate=0.,
two_stream=False)
transformer_xl_block_config = transformer_xl_block.get_config()
new_block = transformer_xl.TransformerXLBlock.from_config(
transformer_xl_block_config)
self.assertEqual(transformer_xl_block_config, new_block.get_config())
@keras_parameterized.run_all_keras_modes
class TransformerXLTest(keras_parameterized.TestCase):
@combinations.generate(combinations.combine(
two_stream=[True, False],
memory_length=[0, 4],
reuse_length=[0, 4],
tie_attention_biases=[True, False],
state=[True, False],
mask=[True, False],
segment=[True, False]))
def test_transformer_xl(
self,
two_stream,
memory_length,
reuse_length,
tie_attention_biases,
state,
mask,
segment):
batch_size, num_heads, head_size, seq_length = 2, 12, 64, 8
hidden_size, num_predictions, inner_size = 24, 8, 12
num_layers = 3
data = create_mock_transformer_xl_data(
include_biases=False,
num_heads=num_heads,
head_size=head_size,
hidden_size=hidden_size,
seq_length=seq_length,
batch_size=batch_size,
memory_length=memory_length,
num_predictions=num_predictions,
two_stream=two_stream,
num_layers=num_layers,
include_state=state,
include_mask=mask,
include_segment=segment)
transformer_xl_layer = transformer_xl.TransformerXL(
vocab_size=32000,
num_layers=num_layers,
head_size=head_size,
hidden_size=hidden_size,
num_attention_heads=num_heads,
inner_size=inner_size,
dropout_rate=0.,
attention_dropout_rate=0.,
initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
two_stream=two_stream,
tie_attention_biases=tie_attention_biases,
memory_length=memory_length,
reuse_length=reuse_length,
inner_activation="relu")
attention_output, cached_memory_states = transformer_xl_layer(**data)
if two_stream:
self.assertEqual(attention_output.shape,
[batch_size, num_predictions, hidden_size])
else:
self.assertEqual(attention_output.shape,
[batch_size, seq_length, hidden_size])
self.assertEqual(len(cached_memory_states), num_layers)
def test_get_config(self):
transformer_xl_layer = transformer_xl.TransformerXL(
vocab_size=32000,
num_layers=12,
hidden_size=36,
head_size=12,
num_attention_heads=12,
inner_size=12,
dropout_rate=0.,
attention_dropout_rate=0.,
initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
two_stream=False,
tie_attention_biases=True,
memory_length=0,
reuse_length=0,
inner_activation="relu")
transformer_xl_config = transformer_xl_layer.get_config()
new_transformer_xl = transformer_xl.TransformerXL.from_config(
transformer_xl_config)
self.assertEqual(transformer_xl_config, new_transformer_xl.get_config())
if __name__ == "__main__":
np.random.seed(0)
tf.random.set_seed(0)
tf.test.main()
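
For quick reference, a minimal shape walk-through of the mock-data helper and `TransformerXLBlock` exercised above; the parameter values mirror `test_transformer_xl_block`, and the shape comments follow directly from the helper's code.

data = create_mock_transformer_xl_data(
    batch_size=2, num_heads=12, head_size=64, hidden_size=24,
    seq_length=8, memory_length=4, include_state=True, include_mask=True)
# relative_position_encoding: (2, 16, 24)    -- seq_length * 2 along axis 1
# content_stream:             (2, 8, 24)
# state (num_layers == 1):    (2, 4, 24)     -- memory_length along axis 1
# content_attention_mask:     (2, 12, 8, 12) -- total_seq_length = 8 + 4
block = transformer_xl.TransformerXLBlock(
    vocab_size=32000, hidden_size=24, num_attention_heads=12, head_size=64,
    inner_size=12, dropout_rate=0., attention_dropout_rate=0.,
    two_stream=False)
outputs = block(**data)  # outputs["content_attention"]: (2, 8, 24)
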
......@@ -72,7 +72,11 @@ class BertClassifier(tf.keras.Model):
if use_encoder_pooler:
# Because we have a copy of inputs to create this Model object, we can
# invoke the Network object with its own input tensors to start the Model.
_, cls_output = network(inputs)
outputs = network(inputs)
if isinstance(outputs, list):
cls_output = outputs[1]
else:
cls_output = outputs['pooled_output']
cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)
self.classifier = networks.Classification(
......@@ -83,7 +87,11 @@ class BertClassifier(tf.keras.Model):
name='sentence_prediction')
predictions = self.classifier(cls_output)
else:
sequence_output, _ = network(inputs)
outputs = network(inputs)
if isinstance(outputs, list):
sequence_output = outputs[0]
else:
sequence_output = outputs['sequence_output']
self.classifier = layers.ClassificationHead(
inner_dim=sequence_output.shape[-1],
num_classes=num_classes,
......
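
For illustration, a hedged sketch of building a classifier on top of a dict-outputs encoder, following the branch added in the hunk above; passing `num_classes` as the second argument is assumed from the accompanying tests, and the remaining constructor defaults are left untouched.

encoder = networks.BertEncoder(vocab_size=100, num_layers=2, dict_outputs=True)
model = bert_classifier.BertClassifier(encoder, num_classes=3)
# With use_encoder_pooler=True the model reads outputs['pooled_output']
# (or outputs[1] for a legacy list-output encoder) before dropout and the
# Classification network; otherwise it takes outputs['sequence_output'] and
# feeds a ClassificationHead, as shown in the diff above.
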
......@@ -14,10 +14,6 @@
# ==============================================================================
"""Tests for BERT trainer network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
......@@ -31,13 +27,15 @@ from official.nlp.modeling.models import bert_classifier
@keras_parameterized.run_all_keras_modes
class BertClassifierTest(keras_parameterized.TestCase):
@parameterized.parameters(1, 3)
def test_bert_trainer(self, num_classes):
@parameterized.named_parameters(('single_cls', 1, False), ('3_cls', 3, False),
('3_cls_dictoutputs', 3, True))
def test_bert_trainer(self, num_classes, dict_outputs):
"""Validate that the Keras object can be created."""
# Build a transformer network to use within the BERT trainer.
vocab_size = 100
sequence_length = 512
test_network = networks.BertEncoder(vocab_size=vocab_size, num_layers=2)
test_network = networks.BertEncoder(
vocab_size=vocab_size, num_layers=2, dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
......
......@@ -161,8 +161,9 @@ class BertPretrainerV2(tf.keras.Model):
name: The name of the model.
Inputs: Inputs defined by the encoder network, plus `masked_lm_positions` as a
dictionary.
Outputs: A dictionary of `lm_output` and classification head outputs keyed by
head names.
Outputs: A dictionary of `lm_output`, classification head outputs keyed by
head names, and also outputs from `encoder_network`, keyed by
`pooled_output`, `sequence_output` and `encoder_outputs` (if any).
"""
def __init__(
......@@ -180,17 +181,32 @@ class BertPretrainerV2(tf.keras.Model):
'classification_heads': classification_heads,
'name': name,
}
self.encoder_network = encoder_network
inputs = copy.copy(self.encoder_network.inputs)
sequence_output, _ = self.encoder_network(inputs)
outputs = dict()
encoder_network_outputs = self.encoder_network(inputs)
if isinstance(encoder_network_outputs, list):
outputs['pooled_output'] = encoder_network_outputs[1]
# When `encoder_network` was instantiated with return_all_encoder_outputs
# set to True, `encoder_network_outputs[0]` is a list containing
# all transformer layers' output.
if isinstance(encoder_network_outputs[0], list):
outputs['encoder_outputs'] = encoder_network_outputs[0]
outputs['sequence_output'] = encoder_network_outputs[0][-1]
else:
outputs['sequence_output'] = encoder_network_outputs[0]
elif isinstance(encoder_network_outputs, dict):
outputs = encoder_network_outputs
else:
raise ValueError('encoder_network\'s output should be either a list '
'or a dict, but got %s' % encoder_network_outputs)
sequence_output = outputs['sequence_output']
self.classification_heads = classification_heads or []
if len(set([cls.name for cls in self.classification_heads])) != len(
self.classification_heads):
raise ValueError('Classification heads should have unique names.')
outputs = dict()
self.masked_lm = layers.MaskedLM(
embedding_table=self.encoder_network.get_embedding_table(),
activation=mlm_activation,
......@@ -199,7 +215,7 @@ class BertPretrainerV2(tf.keras.Model):
masked_lm_positions = tf.keras.layers.Input(
shape=(None,), name='masked_lm_positions', dtype=tf.int32)
inputs.append(masked_lm_positions)
outputs['lm_output'] = self.masked_lm(
outputs['mlm_logits'] = self.masked_lm(
sequence_output, masked_positions=masked_lm_positions)
for cls_head in self.classification_heads:
outputs[cls_head.name] = cls_head(sequence_output)
......
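
A hedged sketch of consuming the enlarged BertPretrainerV2 output dictionary documented above; only `encoder_network` is confirmed by this diff, so the other keyword names and the shape comments are taken from the pretrainer test further down and should be treated as assumptions.

import numpy as np

encoder = networks.BertEncoder(vocab_size=100, num_layers=2, dict_outputs=True)
pretrainer = bert_pretrainer.BertPretrainerV2(encoder_network=encoder)

word_ids = np.random.randint(100, size=(2, 16))
mask = np.random.randint(2, size=(2, 16))
type_ids = np.random.randint(2, size=(2, 16))
masked_lm_positions = np.random.randint(16, size=(2, 4))

outputs = pretrainer([word_ids, mask, type_ids, masked_lm_positions])
mlm_logits = outputs['mlm_logits']            # renamed from 'lm_output'
sequence_output = outputs['sequence_output']  # [2, 16, hidden_size]
pooled_output = outputs['pooled_output']      # [2, hidden_size]
# Present only when the encoder exposes per-layer outputs
# (dict_outputs=True or return_all_encoder_outputs=True):
encoder_outputs = outputs.get('encoder_outputs')
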
......@@ -12,12 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for BERT trainer network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for BERT pretrainer model."""
import itertools
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
......@@ -111,15 +109,24 @@ class BertPretrainerTest(keras_parameterized.TestCase):
self.assertAllEqual(bert_trainer_model.get_config(),
new_bert_trainer_model.get_config())
def test_bert_pretrainerv2(self):
@parameterized.parameters(itertools.product(
(False, True),
(False, True),
))
def test_bert_pretrainerv2(self, dict_outputs, return_all_encoder_outputs):
"""Validate that the Keras object can be created."""
# Build a transformer network to use within the BERT trainer.
vocab_size = 100
sequence_length = 512
hidden_size = 48
num_layers = 2
test_network = networks.BertEncoder(
vocab_size=vocab_size,
num_layers=2,
max_sequence_length=sequence_length)
num_layers=num_layers,
hidden_size=hidden_size,
max_sequence_length=sequence_length,
return_all_encoder_outputs=return_all_encoder_outputs,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_pretrainer.BertPretrainerV2(
......@@ -134,9 +141,28 @@ class BertPretrainerTest(keras_parameterized.TestCase):
# Invoke the trainer model on the inputs. This causes the layer to be built.
outputs = bert_trainer_model([word_ids, mask, type_ids, lm_mask])
has_encoder_outputs = dict_outputs or return_all_encoder_outputs
if has_encoder_outputs:
self.assertSameElements(
outputs.keys(),
['sequence_output', 'pooled_output', 'mlm_logits', 'encoder_outputs'])
self.assertLen(outputs['encoder_outputs'], num_layers)
else:
self.assertSameElements(
outputs.keys(), ['sequence_output', 'pooled_output', 'mlm_logits'])
# Validate that the outputs are of the expected shape.
expected_lm_shape = [None, num_token_predictions, vocab_size]
self.assertAllEqual(expected_lm_shape, outputs['lm_output'].shape.as_list())
self.assertAllEqual(expected_lm_shape,
outputs['mlm_logits'].shape.as_list())
expected_sequence_output_shape = [None, sequence_length, hidden_size]
self.assertAllEqual(expected_sequence_output_shape,
outputs['sequence_output'].shape.as_list())
expected_pooled_output_shape = [None, hidden_size]
self.assertAllEqual(expected_pooled_output_shape,
outputs['pooled_output'].shape.as_list())
def test_v2_serialize_deserialize(self):
"""Validate that the BERT trainer can be serialized and deserialized."""
......
......@@ -64,7 +64,11 @@ class BertSpanLabeler(tf.keras.Model):
# Because we have a copy of inputs to create this Model object, we can
# invoke the Network object with its own input tensors to start the Model.
sequence_output, _ = network(inputs)
outputs = network(inputs)
if isinstance(outputs, list):
sequence_output = outputs[0]
else:
sequence_output = outputs['sequence_output']
# This is an instance variable for ease of access to the underlying task
# network.
......
......@@ -14,10 +14,7 @@
# ==============================================================================
"""Tests for BERT trainer network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
......@@ -30,12 +27,14 @@ from official.nlp.modeling.models import bert_span_labeler
@keras_parameterized.run_all_keras_modes
class BertSpanLabelerTest(keras_parameterized.TestCase):
def test_bert_trainer(self):
@parameterized.parameters(True, False)
def test_bert_trainer(self, dict_outputs):
"""Validate that the Keras object can be created."""
# Build a transformer network to use within the BERT trainer.
vocab_size = 100
sequence_length = 512
test_network = networks.BertEncoder(vocab_size=vocab_size, num_layers=2)
test_network = networks.BertEncoder(
vocab_size=vocab_size, num_layers=2, dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network)
......
......@@ -67,7 +67,11 @@ class BertTokenClassifier(tf.keras.Model):
# Because we have a copy of inputs to create this Model object, we can
# invoke the Network object with its own input tensors to start the Model.
sequence_output, _ = network(inputs)
outputs = network(inputs)
if isinstance(outputs, list):
sequence_output = outputs[0]
else:
sequence_output = outputs['sequence_output']
sequence_output = tf.keras.layers.Dropout(rate=dropout_rate)(
sequence_output)
......
......@@ -12,12 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for BERT trainer network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for BERT token classifier."""
from absl.testing import parameterized
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
......@@ -30,7 +27,8 @@ from official.nlp.modeling.models import bert_token_classifier
@keras_parameterized.run_all_keras_modes
class BertTokenClassifierTest(keras_parameterized.TestCase):
def test_bert_trainer(self):
@parameterized.parameters(True, False)
def test_bert_trainer(self, dict_outputs):
"""Validate that the Keras object can be created."""
# Build a transformer network to use within the BERT trainer.
vocab_size = 100
......@@ -38,7 +36,8 @@ class BertTokenClassifierTest(keras_parameterized.TestCase):
test_network = networks.BertEncoder(
vocab_size=vocab_size,
num_layers=2,
max_sequence_length=sequence_length)
max_sequence_length=sequence_length,
dict_outputs=dict_outputs)
# Create a BERT trainer with the created network.
num_classes = 3
......
......@@ -14,12 +14,7 @@
# ==============================================================================
"""Trainer network for dual encoder style models."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
# Import libraries
import tensorflow as tf
from official.nlp.modeling import layers
......@@ -84,11 +79,16 @@ class DualEncoder(tf.keras.Model):
shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
left_inputs = [left_word_ids, left_mask, left_type_ids]
left_sequence_output, left_encoded = network(left_inputs)
left_outputs = network(left_inputs)
if isinstance(left_outputs, list):
left_sequence_output, left_encoded = left_outputs
else:
left_sequence_output = left_outputs['sequence_output']
left_encoded = left_outputs['pooled_output']
if normalize:
left_encoded = tf.keras.layers.Lambda(
lambda x: tf.nn.l2_normalize(x, axis=1))(left_encoded)
lambda x: tf.nn.l2_normalize(x, axis=1))(
left_encoded)
if output == 'logits':
right_word_ids = tf.keras.layers.Input(
......@@ -99,33 +99,40 @@ class DualEncoder(tf.keras.Model):
shape=(max_seq_length,), dtype=tf.int32, name='right_type_ids')
right_inputs = [right_word_ids, right_mask, right_type_ids]
_, right_encoded = network(right_inputs)
right_outputs = network(right_inputs)
if isinstance(right_outputs, list):
_, right_encoded = right_outputs
else:
right_encoded = right_outputs['pooled_output']
if normalize:
right_encoded = tf.keras.layers.Lambda(
lambda x: tf.nn.l2_normalize(x, axis=1))(right_encoded)
dot_products = layers.MatMulWithMargin(logit_scale=logit_scale,
logit_margin=logit_margin,
name='dot_product')
inputs = [left_word_ids, left_mask, left_type_ids, right_word_ids,
right_mask, right_type_ids]
lambda x: tf.nn.l2_normalize(x, axis=1))(
right_encoded)
dot_products = layers.MatMulWithMargin(
logit_scale=logit_scale,
logit_margin=logit_margin,
name='dot_product')
inputs = [
left_word_ids, left_mask, left_type_ids, right_word_ids, right_mask,
right_type_ids
]
left_logits, right_logits = dot_products(left_encoded, right_encoded)
outputs = [left_logits, right_logits]
outputs = dict(left_logits=left_logits, right_logits=right_logits)
elif output == 'predictions':
inputs = [left_word_ids, left_mask, left_type_ids]
# To keep consistent with legacy BERT hub modules, the outputs are
# "pooled_output" and "sequence_output".
outputs = [left_encoded, left_sequence_output]
outputs = dict(
sequence_output=left_sequence_output, pooled_output=left_encoded)
else:
raise ValueError('output type %s is not supported' % output)
super(DualEncoder, self).__init__(
inputs=inputs, outputs=outputs, **kwargs)
super(DualEncoder, self).__init__(inputs=inputs, outputs=outputs, **kwargs)
# Set _self_setattr_tracking to True so it can be exported with assets.
self._self_setattr_tracking = True
......
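
For reference, a hedged sketch of the DualEncoder dictionary outputs introduced above; the constructor keyword names beyond the encoder network, `max_seq_length` and `output` are assumed, and the inputs are random placeholders.

import numpy as np

encoder = networks.BertEncoder(vocab_size=100, num_layers=2, dict_outputs=True)
model = dual_encoder.DualEncoder(encoder, max_seq_length=16, output='logits')

left = [np.random.randint(100, size=(2, 16)),
        np.ones((2, 16), dtype='int32'),
        np.zeros((2, 16), dtype='int32')]
right = [np.random.randint(100, size=(2, 16)),
         np.ones((2, 16), dtype='int32'),
         np.zeros((2, 16), dtype='int32')]
outputs = model(left + right)
left_logits = outputs['left_logits']    # previously outputs[0]
right_logits = outputs['right_logits']  # previously outputs[1]
# With output='predictions' the model instead returns
# {'sequence_output': ..., 'pooled_output': ...} for the left inputs only.
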
......@@ -14,11 +14,6 @@
# ==============================================================================
"""Tests for dual encoder network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Import libraries
from absl.testing import parameterized
import tensorflow as tf
......@@ -42,7 +37,8 @@ class DualEncoderTest(keras_parameterized.TestCase):
vocab_size=vocab_size,
num_layers=2,
hidden_size=hidden_size,
sequence_length=sequence_length)
sequence_length=sequence_length,
dict_outputs=True)
# Create a dual encoder model with the created network.
dual_encoder_model = dual_encoder.DualEncoder(
......@@ -59,21 +55,19 @@ class DualEncoderTest(keras_parameterized.TestCase):
if output == 'logits':
outputs = dual_encoder_model([
left_word_ids, left_mask, left_type_ids,
right_word_ids, right_mask, right_type_ids])
left_encoded, _ = outputs
left_word_ids, left_mask, left_type_ids, right_word_ids, right_mask,
right_type_ids
])
_ = outputs['left_logits']
elif output == 'predictions':
left_encoded, left_sequence_output = dual_encoder_model([
left_word_ids, left_mask, left_type_ids])
outputs = dual_encoder_model([left_word_ids, left_mask, left_type_ids])
# Validate that the outputs are of the expected shape.
expected_encoding_shape = [None, 768]
self.assertAllEqual(expected_encoding_shape, left_encoded.shape.as_list())
expected_sequence_shape = [None, sequence_length, 768]
self.assertAllEqual(expected_sequence_shape,
left_sequence_output.shape.as_list())
outputs['sequence_output'].shape.as_list())
left_encoded = outputs['pooled_output']
expected_encoding_shape = [None, 768]
self.assertAllEqual(expected_encoding_shape, left_encoded.shape.as_list())
@parameterized.parameters((192, 'logits'), (768, 'predictions'))
def test_dual_encoder_tensor_call(self, hidden_size, output):
......
......@@ -14,10 +14,6 @@
# ==============================================================================
"""Trainer network for ELECTRA models."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import copy
......@@ -139,14 +135,11 @@ class ElectraPretrainer(tf.keras.Model):
masked_lm_positions = inputs['masked_lm_positions']
### Generator ###
sequence_output, cls_output = self.generator_network(
[input_word_ids, input_mask, input_type_ids])
sequence_output = self.generator_network(
[input_word_ids, input_mask, input_type_ids])['sequence_output']
# The generator encoder network may get outputs from all layers.
if isinstance(sequence_output, list):
sequence_output = sequence_output[-1]
if isinstance(cls_output, list):
cls_output = cls_output[-1]
lm_outputs = self.masked_lm(sequence_output, masked_lm_positions)
sentence_outputs = self.classification(sequence_output)
......@@ -157,10 +150,10 @@ class ElectraPretrainer(tf.keras.Model):
### Discriminator ###
disc_input = fake_data['inputs']
disc_label = fake_data['is_fake_tokens']
disc_sequence_output, _ = self.discriminator_network([
disc_sequence_output = self.discriminator_network([
disc_input['input_word_ids'], disc_input['input_mask'],
disc_input['input_type_ids']
])
])['sequence_output']
# The discriminator encoder network may get outputs from all layers.
if isinstance(disc_sequence_output, list):
......
......@@ -14,10 +14,6 @@
# ==============================================================================
"""Tests for ELECTRA pre trainer network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
......@@ -38,11 +34,13 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
test_generator_network = networks.BertEncoder(
vocab_size=vocab_size,
num_layers=2,
max_sequence_length=sequence_length)
max_sequence_length=sequence_length,
dict_outputs=True)
test_discriminator_network = networks.BertEncoder(
vocab_size=vocab_size,
num_layers=2,
max_sequence_length=sequence_length)
max_sequence_length=sequence_length,
dict_outputs=True)
# Create a ELECTRA trainer with the created network.
num_classes = 3
......@@ -92,9 +90,9 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
# Build a transformer network to use within the ELECTRA trainer. (Here, we
# use a short sequence_length for convenience.)
test_generator_network = networks.BertEncoder(
vocab_size=100, num_layers=4, max_sequence_length=3)
vocab_size=100, num_layers=4, max_sequence_length=3, dict_outputs=True)
test_discriminator_network = networks.BertEncoder(
vocab_size=100, num_layers=4, max_sequence_length=3)
vocab_size=100, num_layers=4, max_sequence_length=3, dict_outputs=True)
# Create a ELECTRA trainer with the created network.
eletrca_trainer_model = electra_pretrainer.ElectraPretrainer(
......
......@@ -142,12 +142,12 @@ class Seq2SeqTransformer(tf.keras.Model):
self._beam_size = beam_size
self._alpha = alpha
self._dtype = dtype
self.embedding_lookup = layers.OnDeviceEmbedding(
self.embedding_lookup = keras_nlp.layers.OnDeviceEmbedding(
vocab_size=self._vocab_size,
embedding_width=self._embedding_width,
initializer=tf.random_normal_initializer(
mean=0., stddev=self._embedding_width**-0.5),
use_scale=True)
scale_factor=self._embedding_width**0.5)
self.encoder_layer = encoder_layer
self.decoder_layer = decoder_layer
self.position_embedding = layers.RelativePositionEmbedding(
......@@ -472,7 +472,7 @@ class TransformerEncoder(tf.keras.layers.Layer):
self.encoder_layers = []
for i in range(self.num_layers):
self.encoder_layers.append(
keras_nlp.TransformerEncoderBlock(
keras_nlp.layers.TransformerEncoderBlock(
num_attention_heads=self.num_attention_heads,
inner_dim=self._intermediate_size,
inner_activation=self._activation,
......@@ -581,7 +581,7 @@ class TransformerDecoder(tf.keras.layers.Layer):
self.decoder_layers = []
for i in range(self.num_layers):
self.decoder_layers.append(
layers.TransformerDecoderLayer(
layers.TransformerDecoderBlock(
num_attention_heads=self.num_attention_heads,
intermediate_size=self._intermediate_size,
intermediate_activation=self._activation,
......
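
A short, hedged note on the embedding change above: the keras_nlp layer takes an explicit `scale_factor` instead of the old boolean `use_scale`, so the usual Transformer scaling by the square root of the embedding width is now spelled out. The vocab size below is illustrative, and the equivalence to the previous `use_scale=True` behavior is assumed rather than shown in this diff.

import tensorflow as tf
from official.nlp import keras_nlp

embedding_width = 512
embedding_lookup = keras_nlp.layers.OnDeviceEmbedding(
    vocab_size=32000,  # illustrative
    embedding_width=embedding_width,
    initializer=tf.random_normal_initializer(
        mean=0., stddev=embedding_width ** -0.5),
    scale_factor=embedding_width ** 0.5)  # lookups multiplied by sqrt(512) ~= 22.63
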
......@@ -10,7 +10,7 @@ Transformer-based encoder as described in ["BERT: Pre-training of Deep
Bidirectional Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding lookups,
transformer layers and pooling layer.
* [`AlbertTransformerEncoder`](albert_transformer_encoder.py) implements a
* [`AlbertEncoder`](albert_encoder.py) implements a
Transformer-encoder described in the paper ["ALBERT: A Lite BERT for
Self-supervised Learning of Language Representations"](https://arxiv.org/abs/1909.11942)
(a minimal instantiation sketch follows this list). Compared with [BERT](https://arxiv.org/abs/1810.04805), ALBERT factorizes embedding parameters
......@@ -26,3 +26,4 @@ to 1) head.
* [`SpanLabeling`](span_labeling.py) implements a single-span labeler (that is, a prediction head that can predict one start and end index per batch item) based on a single dense hidden layer. It can be used in the SQuAD task.
* [`XLNetBase`](xlnet_base.py) implements the base network used in ["XLNet: Generalized Autoregressive Pretraining for Language Understanding"](https://arxiv.org/abs/1906.08237). It includes embedding lookups, relative position encodings, mask computations, segment matrix computations and Transformer XL layers using one or two stream relative self-attention.
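
With the rename described above, the ALBERT encoder is now exposed as `AlbertEncoder` from `albert_encoder.py` (see the updated package imports in the next hunk). A minimal hedged instantiation, with illustrative ALBERT-base-style hyperparameters:

from official.nlp.modeling import networks

albert = networks.AlbertEncoder(
    vocab_size=30000,   # illustrative values
    embedding_width=128,
    hidden_size=768,
    num_attention_heads=12,
    num_layers=12)
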
......@@ -13,11 +13,12 @@
# limitations under the License.
# ==============================================================================
"""Networks package definition."""
from official.nlp.modeling.networks.albert_transformer_encoder import AlbertTransformerEncoder
from official.nlp.modeling.networks.albert_encoder import AlbertEncoder
from official.nlp.modeling.networks.bert_encoder import BertEncoder
from official.nlp.modeling.networks.classification import Classification
from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
from official.nlp.modeling.networks.mobile_bert_encoder import MobileBERTEncoder
from official.nlp.modeling.networks.span_labeling import SpanLabeling
from official.nlp.modeling.networks.xlnet_base import XLNetBase
# Backward compatibility. The modules are deprecated.
TransformerEncoder = BertEncoder
......@@ -14,10 +14,6 @@
# ==============================================================================
"""ALBERT (https://arxiv.org/abs/1810.04805) text encoder network."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import tensorflow as tf
......@@ -27,7 +23,7 @@ from official.nlp.modeling import layers
@tf.keras.utils.register_keras_serializable(package='Text')
class AlbertTransformerEncoder(tf.keras.Model):
class AlbertEncoder(tf.keras.Model):
"""ALBERT (https://arxiv.org/abs/1810.04805) text encoder network.
This network implements the encoder described in the paper "ALBERT: A Lite
......@@ -64,6 +60,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
attention_dropout_rate: The dropout rate to use for the attention layers
within the transformer layers.
initializer: The initializer to use for all weights in this encoder.
dict_outputs: Whether to use a dictionary as the model outputs.
"""
def __init__(self,
......@@ -79,6 +76,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
dropout_rate=0.1,
attention_dropout_rate=0.1,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
dict_outputs=False,
**kwargs):
activation = tf.keras.activations.get(activation)
initializer = tf.keras.initializers.get(initializer)
......@@ -116,7 +114,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
self._position_embedding_layer = keras_nlp.PositionEmbedding(
self._position_embedding_layer = keras_nlp.layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
......@@ -152,7 +150,7 @@ class AlbertTransformerEncoder(tf.keras.Model):
data = embeddings
attention_mask = layers.SelfAttentionMask()([data, mask])
shared_layer = keras_nlp.TransformerEncoderBlock(
shared_layer = keras_nlp.layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=intermediate_size,
inner_activation=activation,
......@@ -160,8 +158,10 @@ class AlbertTransformerEncoder(tf.keras.Model):
attention_dropout=attention_dropout_rate,
kernel_initializer=initializer,
name='transformer')
encoder_outputs = []
for _ in range(num_layers):
data = shared_layer([data, attention_mask])
encoder_outputs.append(data)
first_token_tensor = (
tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data)
......@@ -172,9 +172,17 @@ class AlbertTransformerEncoder(tf.keras.Model):
kernel_initializer=initializer,
name='pooler_transform')(
first_token_tensor)
super(AlbertTransformerEncoder, self).__init__(
inputs=[word_ids, mask, type_ids], outputs=[data, cls_output], **kwargs)
if dict_outputs:
outputs = dict(
sequence_output=data,
encoder_outputs=encoder_outputs,
pooled_output=cls_output,
)
else:
outputs = [data, cls_output]
super(AlbertEncoder, self).__init__(
inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
def get_embedding_table(self):
return self._embedding_layer.embeddings
......
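
A hedged usage sketch of the new `dict_outputs` switch on `AlbertEncoder`; the argument names come from the constructor above, while the sizes are small test-style placeholders.

import numpy as np
from official.nlp.modeling.networks import albert_encoder

encoder = albert_encoder.AlbertEncoder(
    vocab_size=100,
    embedding_width=8,
    hidden_size=16,
    num_attention_heads=2,
    num_layers=3,
    dict_outputs=True)

word_ids = np.random.randint(100, size=(2, 12))
mask = np.ones((2, 12), dtype='int32')
type_ids = np.zeros((2, 12), dtype='int32')

outputs = encoder([word_ids, mask, type_ids])
sequence_output = outputs['sequence_output']    # [2, 12, 16]
pooled_output = outputs['pooled_output']        # [2, 16]
per_layer_outputs = outputs['encoder_outputs']  # list with num_layers entries
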
......@@ -23,16 +23,16 @@ import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.networks import albert_transformer_encoder
from official.nlp.modeling.networks import albert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
class AlbertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(AlbertTransformerEncoderTest, self).tearDown()
super(AlbertEncoderTest, self).tearDown()
tf.keras.mixed_precision.experimental.set_policy("float32")
@parameterized.named_parameters(
......@@ -52,7 +52,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
# Create a small TransformerEncoder for testing.
test_network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
test_network = albert_encoder.AlbertEncoder(**kwargs)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
......@@ -84,13 +84,14 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
sequence_length = 21
vocab_size = 57
num_types = 7
num_layers = 3
# Create a small TransformerEncoder for testing.
test_network = albert_transformer_encoder.AlbertTransformerEncoder(
test_network = albert_encoder.AlbertEncoder(
vocab_size=vocab_size,
embedding_width=8,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
num_layers=num_layers,
type_vocab_size=num_types)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
......@@ -109,21 +110,43 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
_ = model.predict([word_id_data, mask_data, type_id_data])
list_outputs = model.predict([word_id_data, mask_data, type_id_data])
# Creates a TransformerEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = albert_transformer_encoder.AlbertTransformerEncoder(
test_network = albert_encoder.AlbertEncoder(
vocab_size=vocab_size,
embedding_width=8,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
num_layers=num_layers,
type_vocab_size=num_types)
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
_ = model.predict([word_id_data, mask_data, type_id_data])
# Tests dictionary outputs.
test_network_dict = albert_encoder.AlbertEncoder(
vocab_size=vocab_size,
embedding_width=8,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=num_layers,
type_vocab_size=num_types,
dict_outputs=True)
_ = test_network_dict([word_ids, mask, type_ids])
test_network_dict.set_weights(test_network.get_weights())
list_outputs = test_network([word_id_data, mask_data, type_id_data])
dict_outputs = test_network_dict(
dict(
input_word_ids=word_id_data,
input_mask=mask_data,
input_type_ids=type_id_data))
self.assertAllEqual(list_outputs[0], dict_outputs["sequence_output"])
self.assertAllEqual(list_outputs[1], dict_outputs["pooled_output"])
self.assertLen(dict_outputs["encoder_outputs"], num_layers)
def test_serialize_deserialize(self):
tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
# Create a network object that sets all of its config options.
......@@ -140,7 +163,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
dropout_rate=0.05,
attention_dropout_rate=0.22,
initializer="glorot_uniform")
network = albert_transformer_encoder.AlbertTransformerEncoder(**kwargs)
network = albert_encoder.AlbertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["activation"] = tf.keras.activations.serialize(
......@@ -151,7 +174,7 @@ class AlbertTransformerEncoderTest(keras_parameterized.TestCase):
# Create another network object from the first object's config.
new_network = (
albert_transformer_encoder.AlbertTransformerEncoder.from_config(
albert_encoder.AlbertEncoder.from_config(
network.get_config()))
# Validate that the config can be forced to JSON.
......
......@@ -19,11 +19,13 @@ import tensorflow as tf
from official.modeling import activations
from official.nlp import keras_nlp
from official.nlp.modeling import layers
# This class is being replaced by keras_nlp.encoders.BertEncoder and merely
# acts as a wrapper if you need: 1) list outputs instead of dict outputs,
# 2) shared embedding layer.
@tf.keras.utils.register_keras_serializable(package='Text')
class BertEncoder(tf.keras.Model):
class BertEncoder(keras_nlp.encoders.BertEncoder):
"""Bi-directional Transformer-based encoder network.
This network implements a bi-directional Transformer-based encoder as
......@@ -58,7 +60,9 @@ class BertEncoder(tf.keras.Model):
within the transformer layers.
initializer: The initializer to use for all weights in this encoder.
return_all_encoder_outputs: Whether to output sequence embedding outputs of
all encoder transformer layers.
all encoder transformer layers. Note: when the following `dict_outputs`
argument is True, all encoder outputs are always returned in the dict,
keyed by `encoder_outputs`.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
......@@ -72,6 +76,7 @@ class BertEncoder(tf.keras.Model):
embedding layer. Otherwise, we will reuse the given embedding layer. This
parameter was originally added for the ELECTRA model, which needs to tie the
generator embeddings with the discriminator embeddings.
dict_outputs: Whether to use a dictionary as the model outputs.
"""
def __init__(self,
......@@ -91,140 +96,53 @@ class BertEncoder(tf.keras.Model):
output_range=None,
embedding_width=None,
embedding_layer=None,
dict_outputs=False,
**kwargs):
activation = tf.keras.activations.get(activation)
initializer = tf.keras.initializers.get(initializer)
self._self_setattr_tracking = False
self._config_dict = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'intermediate_size': intermediate_size,
'activation': tf.keras.activations.serialize(activation),
'dropout_rate': dropout_rate,
'attention_dropout_rate': attention_dropout_rate,
'initializer': tf.keras.initializers.serialize(initializer),
'return_all_encoder_outputs': return_all_encoder_outputs,
'output_range': output_range,
'embedding_width': embedding_width,
}
self._embedding_layer_instance = embedding_layer
word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_word_ids')
mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_mask')
type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_type_ids')
if embedding_width is None:
embedding_width = hidden_size
if embedding_layer is None:
self._embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
self._embedding_layer = embedding_layer
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
self._position_embedding_layer = keras_nlp.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
self._type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
super(BertEncoder, self).__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_layers=num_layers,
num_attention_heads=num_attention_heads,
max_sequence_length=max_sequence_length,
type_vocab_size=type_vocab_size,
inner_dim=intermediate_size,
inner_activation=activation,
output_dropout=dropout_rate,
attention_dropout=attention_dropout_rate,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
type_embeddings = self._type_embedding_layer(type_ids)
embeddings = tf.keras.layers.Add()(
[word_embeddings, position_embeddings, type_embeddings])
self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
embeddings = self._embedding_norm_layer(embeddings)
embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
if embedding_width != hidden_size:
self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
embeddings = self._embedding_projection(embeddings)
self._transformer_layers = []
data = embeddings
attention_mask = layers.SelfAttentionMask()([data, mask])
encoder_outputs = []
for i in range(num_layers):
if i == num_layers - 1 and output_range is not None:
transformer_output_range = output_range
output_range=output_range,
embedding_width=embedding_width)
# Replace arguments from keras_nlp.encoders.BertEncoder.
self._config_dict['activation'] = self._config_dict.pop('inner_activation')
self._config_dict['intermediate_size'] = self._config_dict.pop('inner_dim')
self._config_dict['dropout_rate'] = self._config_dict.pop('output_dropout')
self._config_dict['attention_dropout_rate'] = self._config_dict.pop(
'attention_dropout')
self._config_dict['dict_outputs'] = dict_outputs
self._config_dict['return_all_encoder_outputs'] = return_all_encoder_outputs
if dict_outputs:
return
else:
nested_output = self._nested_outputs
cls_output = nested_output['pooled_output']
if return_all_encoder_outputs:
encoder_outputs = nested_output['encoder_outputs']
outputs = [encoder_outputs, cls_output]
else:
transformer_output_range = None
layer = keras_nlp.layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=intermediate_size,
inner_activation=activation,
output_dropout=dropout_rate,
attention_dropout=attention_dropout_rate,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
self._transformer_layers.append(layer)
data = layer([data, attention_mask])
encoder_outputs.append(data)
first_token_tensor = (
tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
encoder_outputs[-1]))
self._pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
cls_output = self._pooler_layer(first_token_tensor)
if return_all_encoder_outputs:
outputs = [encoder_outputs, cls_output]
sequence_output = nested_output['sequence_output']
outputs = [sequence_output, cls_output]
super(keras_nlp.encoders.BertEncoder, self).__init__(
inputs=self.inputs, outputs=outputs, **kwargs)
# Override method for shared embedding use case.
def _build_embedding_layer(self):
if self._embedding_layer_instance is None:
return super(BertEncoder, self)._build_embedding_layer()
else:
outputs = [encoder_outputs[-1], cls_output]
super(BertEncoder, self).__init__(
inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return self._config_dict
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
return self._embedding_layer_instance
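
To make the wrapper behaviour described in the comments above concrete, a hedged sketch: with `dict_outputs=False` the class keeps the legacy `[sequence_output, pooled_output]` (or `[encoder_outputs, pooled_output]` with `return_all_encoder_outputs=True`) list interface, while `dict_outputs=True` defers to the keras_nlp-style dictionary, which per the pretrainer test also carries `encoder_outputs`.

import numpy as np
from official.nlp.modeling.networks import bert_encoder

word_ids = np.random.randint(100, size=(2, 12))
mask = np.ones((2, 12), dtype='int32')
type_ids = np.zeros((2, 12), dtype='int32')

legacy = bert_encoder.BertEncoder(vocab_size=100, num_layers=2)
sequence_output, pooled_output = legacy([word_ids, mask, type_ids])

wrapped = bert_encoder.BertEncoder(vocab_size=100, num_layers=2,
                                   dict_outputs=True)
outputs = wrapped(dict(input_word_ids=word_ids, input_mask=mask,
                       input_type_ids=type_ids))
# outputs['sequence_output'], outputs['pooled_output'] and
# outputs['encoder_outputs'] (one tensor per layer).
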
......@@ -12,11 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for transformer-based text encoder network."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Tests for transformer-based bert encoder network."""
# Import libraries
from absl.testing import parameterized
......@@ -64,6 +60,35 @@ class BertEncoderTest(keras_parameterized.TestCase):
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
test_network_dict = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
dict_outputs=True)
# Create the inputs (note that the first dimension is implicit).
inputs = dict(
input_word_ids=word_ids, input_mask=mask, input_type_ids=type_ids)
_ = test_network_dict(inputs)
test_network_dict.set_weights(test_network.get_weights())
batch_size = 2
vocab_size = 100
num_types = 2
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
list_outputs = test_network([word_id_data, mask_data, type_id_data])
dict_outputs = test_network_dict(
dict(
input_word_ids=word_id_data,
input_mask=mask_data,
input_type_ids=type_id_data))
self.assertAllEqual(list_outputs[0], dict_outputs["sequence_output"])
self.assertAllEqual(list_outputs[1], dict_outputs["pooled_output"])
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
......@@ -199,7 +224,8 @@ class BertEncoderTest(keras_parameterized.TestCase):
initializer="glorot_uniform",
return_all_encoder_outputs=False,
output_range=-1,
embedding_width=16)
embedding_width=16,
dict_outputs=True)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["activation"] = tf.keras.activations.serialize(
......