Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -22,6 +22,7 @@ from absl import logging
import gin
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling import layers
from official.nlp.modeling import networks
......@@ -102,7 +103,7 @@ class BertPretrainer(tf.keras.Model):
masked_lm = layers.MaskedLM(
embedding_table=embedding_table,
activation=activation,
initializer=initializer,
initializer=tf_utils.clone_initializer(initializer),
output=output,
name='cls/predictions')
lm_outputs = masked_lm(
......@@ -111,7 +112,7 @@ class BertPretrainer(tf.keras.Model):
classification = networks.Classification(
input_width=cls_output.shape[-1],
num_classes=num_classes,
initializer=initializer,
initializer=tf_utils.clone_initializer(initializer),
output=output,
name='classification')
sentence_outputs = classification(cls_output)
......@@ -199,6 +200,7 @@ class BertPretrainerV2(tf.keras.Model):
self._config = {
'encoder_network': encoder_network,
'mlm_initializer': mlm_initializer,
'mlm_activation': mlm_activation,
'classification_heads': classification_heads,
'name': name,
}
......
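The recurring change in this commit replaces direct `initializer=initializer` arguments with `tf_utils.clone_initializer(initializer)`, so sibling layers no longer share a single initializer object (with a seeded initializer, sharing one instance can otherwise yield identical weight draws). A minimal sketch of the idea, assumed to approximate rather than reproduce `tf_utils.clone_initializer`:

```python
import tensorflow as tf


def clone_initializer(initializer):
  # Sketch: round-trip a Keras initializer through its config so each layer
  # receives its own instance; strings and plain callables are passed through.
  if isinstance(initializer, tf.keras.initializers.Initializer):
    return initializer.__class__.from_config(initializer.get_config())
  return initializer


shared = tf.keras.initializers.TruncatedNormal(stddev=0.02)
dense_a = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(shared))
dense_b = tf.keras.layers.Dense(8, kernel_initializer=clone_initializer(shared))
assert dense_a.kernel_initializer is not dense_b.kernel_initializer
```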
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -96,21 +96,22 @@ class ElectraPretrainer(tf.keras.Model):
self.masked_lm = layers.MaskedLM(
embedding_table=generator_network.get_embedding_table(),
activation=mlm_activation,
initializer=mlm_initializer,
initializer=tf_utils.clone_initializer(mlm_initializer),
output=output_type,
name='generator_masked_lm')
self.classification = layers.ClassificationHead(
inner_dim=generator_network.get_config()['hidden_size'],
num_classes=num_classes,
initializer=mlm_initializer,
initializer=tf_utils.clone_initializer(mlm_initializer),
name='generator_classification_head')
self.discriminator_projection = tf.keras.layers.Dense(
units=discriminator_network.get_config()['hidden_size'],
activation=mlm_activation,
kernel_initializer=mlm_initializer,
kernel_initializer=tf_utils.clone_initializer(mlm_initializer),
name='discriminator_projection_head')
self.discriminator_head = tf.keras.layers.Dense(
units=1, kernel_initializer=mlm_initializer)
units=1,
kernel_initializer=tf_utils.clone_initializer(mlm_initializer))
def call(self, inputs):
"""ELECTRA forward pass.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -55,6 +55,7 @@ class Module(tf.Module):
initializer: Initializer,
dtype: tf.DType = tf.float32,
**kwargs):
initializer = tf_utils.clone_initializer(initializer)
return tf.Variable(initializer(shape, dtype=dtype, **kwargs), name=name)
def read_variable(self,
......@@ -588,7 +589,8 @@ class MultiHeadAttention(Module):
init_std_rescaling = tf.math.sqrt(tf.cast(self.d_kv, dtype=self.dtype))
query_w_init = (
lambda *args, **kwargs: ( # pylint: disable=g-long-lambda
weight_initializer(*args, **kwargs) / init_std_rescaling))
tf_utils.clone_initializer(weight_initializer)(
*args, **kwargs) / init_std_rescaling))
self.q = Linear3D(
self.d_model,
self.d_kv,
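The rescaled query initializer now also clones the underlying initializer before dividing its draw by `sqrt(d_kv)`, mirroring T5's convention of folding the attention scaling into the query weights. A standalone sketch with made-up dimensions (not the exact `Linear3D` wiring):

```python
import math
import tensorflow as tf

d_model, d_kv = 16, 4  # made-up sizes for illustration
weight_initializer = tf.keras.initializers.TruncatedNormal(stddev=1.0)


def query_w_init(shape, dtype=tf.float32):
  # Clone the initializer, then rescale its draw by 1/sqrt(d_kv).
  cloned = weight_initializer.__class__.from_config(
      weight_initializer.get_config())
  return cloned(shape, dtype=dtype) / math.sqrt(d_kv)


kernel = query_w_init((d_model, d_kv))
print(kernel.shape)  # (16, 4)
```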
......@@ -1004,6 +1006,7 @@ class T5TransformerParams:
num_heads: int
d_ff: int
vocab_size: int
target_vocab_size: Optional[int] = None
dropout_rate: float = 0.0
layer_norm_epsilon: float = 1e-6
shared_embedding: bool = False
......@@ -1020,6 +1023,9 @@ class T5TransformerParams:
num_decoder_layers: Optional[int] = None
one_hot_embedding: bool = True
layer_sharing: bool = False
# If true, uses one relative embedding for all encoder layers and one for all
# decoder layers. Otherwise, each layer gets its own relative embedding.
use_shared_relative_position_bias: bool = True
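The two new fields let a config target a separate output vocabulary and request per-layer relative position biases. A hedged construction example, assuming the module is imported as `t5` as in the tests further down; sizes are illustrative only:

```python
config = t5.T5TransformerParams(
    num_layers=2,
    d_model=8,
    d_kv=4,
    num_heads=4,
    d_ff=32,
    vocab_size=32000,
    target_vocab_size=250,  # decoder embedding/logits use this smaller vocab
    use_shared_relative_position_bias=False,  # one relative embedding per layer
    vocab_embeddings_initializer=tf.keras.initializers.Ones(),
    relative_embeddings_initializer=tf.keras.initializers.Ones())
```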
class Encoder(Module):
......@@ -1048,6 +1054,7 @@ class Encoder(Module):
self.input_embed = shared_embedding
# Creates an alias to the input embed for encoder-only models.
self.word_embed = self.input_embed
if config.use_shared_relative_position_bias:
self.relative_embedding = RelativePositionEmbedding(
num_heads=self.config.num_heads,
relative_attention_num_buckets=self.config
......@@ -1059,6 +1066,22 @@ class Encoder(Module):
dtype=self.dtype,
compute_dtype=self.compute_dtype,
name="relative_posemb")
else:
self.relative_embeddings = []
for layer_idx in range(self.config.num_layers):
relative_embedding = RelativePositionEmbedding(
num_heads=self.config.num_heads,
relative_attention_num_buckets=self.config
.relative_attention_num_buckets,
relative_attention_max_distance=self.config
.relative_attention_max_distance,
bidirectional=self.config.bidirectional,
embeddings_initializer=self.config
.relative_embeddings_initializer,
dtype=self.dtype,
compute_dtype=self.compute_dtype,
name=f"relative_posemb_{layer_idx}")
self.relative_embeddings.append(relative_embedding)
self.input_dropout = Dropout(self.config.dropout_rate,)
self.encoder_layers = []
for layer_idx in range(self.config.num_layers):
......@@ -1086,12 +1109,38 @@ class Encoder(Module):
self.output_dropout = Dropout(self.config.dropout_rate,)
@tf.Module.with_name_scope
def __call__(self, inputs, encoder_mask=None, training=False):
def get_relpos_bias(self,
input_length: int,
dense_inputs: tf.Tensor,
layer_idx: Optional[int] = None) -> tf.Tensor:
if self.config.use_shared_relative_position_bias:
position_bias = self.relative_embedding(input_length, input_length)
else:
position_bias = self.relative_embeddings[layer_idx](input_length,
input_length)
if dense_inputs is not None:
# Here we ignore relative position bias for dense embeddings.
# TODO(yejiayu): If we proceed to video use cases, rework this part.
dense_input_length = tf_utils.get_shape_list(dense_inputs)[1]
# Position bias shape: [batch, 1, len, len]
paddings = tf.constant([[0, 0], [0, 0], [0, dense_input_length],
[0, dense_input_length]])
position_bias = tf.pad(position_bias, paddings, "CONSTANT")
return position_bias
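Dense inputs carry no relative position signal, so the bias computed over the word-id positions is zero-padded on its last two axes out to the combined token-plus-dense length. A small shape check under assumed sizes:

```python
import tensorflow as tf

batch, input_length, dense_input_length = 2, 6, 2
# Position bias over the word-id positions only; shape [batch, 1, len, len]
# per the comment in get_relpos_bias above.
position_bias = tf.random.normal([batch, 1, input_length, input_length])
paddings = tf.constant([[0, 0], [0, 0],
                        [0, dense_input_length], [0, dense_input_length]])
padded = tf.pad(position_bias, paddings, "CONSTANT")  # zeros for dense slots
print(padded.shape)  # (2, 1, 8, 8)
```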
@tf.Module.with_name_scope
def __call__(self,
inputs=None,
encoder_mask=None,
dense_inputs=None,
training=False):
"""Applies Transformer model on the inputs.
Args:
inputs: input data
inputs: input word ids. Optional if dense data are provided.
encoder_mask: the encoder self-attention mask.
dense_inputs: dense input data. Concatenated after the token embeddings if
  word ids are provided.
training: whether it is training pass, affecting dropouts.
Returns:
......@@ -1101,14 +1150,26 @@ class Encoder(Module):
if encoder_mask is not None:
encoder_mask = tf.cast(encoder_mask, self.compute_dtype)
cfg = self.config
x = self.input_embed(inputs, one_hot=cfg.one_hot_embedding)
inputs_array = []
if inputs is not None:
inputs_array.append(
self.input_embed(inputs, one_hot=cfg.one_hot_embedding))
if dense_inputs is not None:
inputs_array.append(dense_inputs)
if not inputs_array:
raise ValueError("At least one of inputs and dense_inputs must not be "
"None.")
x = tf.concat(inputs_array, axis=1)
tensor_shape = tf_utils.get_shape_list(x)
tensor_shape[-2] = 1
x = self.input_dropout(x, noise_shape=tensor_shape, training=training)
if inputs is not None:
input_length = tf_utils.get_shape_list(inputs)[1]
position_bias = self.relative_embedding(input_length, input_length)
else:
input_length = 0
for i in range(cfg.num_layers):
position_bias = self.get_relpos_bias(input_length, dense_inputs, i)
x = self.encoder_layers[i](
x,
attention_mask=encoder_mask,
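When both word ids and dense inputs are present, the encoder concatenates the token embeddings and the dense vectors along the sequence axis, so the encoded length is token length plus dense length. A shape-only sketch matching the `test_encoder_with_dense` case below:

```python
import tensorflow as tf

batch, token_len, dense_len, d_model = 4, 8, 2, 4
token_embeddings = tf.random.normal([batch, token_len, d_model])  # embedded word ids
dense_inputs = tf.random.normal([batch, dense_len, d_model])      # pre-embedded dense data

inputs_array = [token_embeddings, dense_inputs]
x = tf.concat(inputs_array, axis=1)  # concatenate along the sequence axis
print(x.shape)  # (4, 10, 4), matching the (4, 10, d_model) test assertion below
```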
......@@ -1133,11 +1194,15 @@ class Decoder(Module):
self.compute_dtype = compute_dtype
if self.config.num_decoder_layers is None:
self.config.num_decoder_layers = self.config.num_layers
if not hasattr(
self.config,
"target_vocab_size") or self.config.target_vocab_size is None:
self.config.target_vocab_size = self.config.vocab_size
with self.name_scope:
# Target Embedding.
if shared_embedding is None:
self.target_embed = Embed(
vocab_size=self.config.vocab_size,
vocab_size=self.config.target_vocab_size,
features=self.config.d_model,
embeddings_initializer=self.config.vocab_embeddings_initializer,
dtype=self.dtype,
......@@ -1147,6 +1212,7 @@ class Decoder(Module):
self.target_embed = shared_embedding
self.target_dropout = Dropout(self.config.dropout_rate,)
# Position bias for the target self attention.
if config.use_shared_relative_position_bias:
self.relative_embedding = RelativePositionEmbedding(
num_heads=self.config.num_heads,
relative_attention_num_buckets=self.config
......@@ -1158,6 +1224,22 @@ class Decoder(Module):
dtype=self.dtype,
compute_dtype=self.compute_dtype,
name="relative_posemb")
else:
self.relative_embeddings = []
for layer_idx in range(self.config.num_decoder_layers):
relative_embedding = RelativePositionEmbedding(
num_heads=self.config.num_heads,
relative_attention_num_buckets=self.config
.relative_attention_num_buckets,
relative_attention_max_distance=self.config
.relative_attention_max_distance,
bidirectional=self.config.bidirectional,
embeddings_initializer=self.config
.relative_embeddings_initializer,
dtype=self.dtype,
compute_dtype=self.compute_dtype,
name=f"relative_posemb_{layer_idx}")
self.relative_embeddings.append(relative_embedding)
self.decoder_layers = []
for layer_idx in range(self.config.num_decoder_layers):
if self.config.layer_sharing and layer_idx > 0:
......@@ -1185,11 +1267,18 @@ class Decoder(Module):
if not self.config.logits_via_embedding:
self.logits_dense = Linear(
in_features=self.config.d_model,
out_features=self.config.vocab_size,
out_features=self.config.target_vocab_size,
use_bias=False,
dtype=self.dtype,
name="logits")
@tf.Module.with_name_scope
def get_relpos_bias(self, input_length: int, layer_idx: int) -> tf.Tensor:
if self.config.use_shared_relative_position_bias:
return self.relative_embedding(input_length, input_length)
else:
return self.relative_embeddings[layer_idx](input_length, input_length)
@tf.Module.with_name_scope
def __call__(self,
decoder_input_tokens,
......@@ -1208,7 +1297,7 @@ class Decoder(Module):
encoded: the encoder outputs.
decoder_mask: the decoder self-attention mask.
encoder_decoder_mask: the cross-attention mask.
decode: Whether to perform autoaggressive decoding.
decode: Whether to perform autoregressive decoding.
decode_position: integer, the position to decode.
cache: The cache dictionary of key, value tensors.
max_decode_len: An optional integer specifying the maximum decoding
......@@ -1217,7 +1306,10 @@ class Decoder(Module):
training: Whether it is training pass, affecting dropouts.
Returns:
output of a transformer encoder.
output of a transformer decoder including
1. logits: Logits for each word in the vocab.
2. raw_logits: Logits along the model dimension.
3. cache: Used for decoding in inference mode.
"""
cfg = self.config
# Casts inputs to the dtype.
......@@ -1230,12 +1322,14 @@ class Decoder(Module):
tensor_shape = tf_utils.get_shape_list(x)
tensor_shape[-2] = 1
x = self.target_dropout(x, noise_shape=tensor_shape, training=training)
for i in range(cfg.num_decoder_layers):
if cache is not None:
position_bias = self.relative_embedding(max_decode_len, max_decode_len)
position_bias = self.get_relpos_bias(max_decode_len, i)
else:
input_length = tf_utils.get_shape_list(decoder_input_tokens)[1]
position_bias = self.relative_embedding(input_length, input_length)
for i in range(cfg.num_decoder_layers):
position_bias = self.get_relpos_bias(input_length, i)
if cache is None:
x, _ = self.decoder_layers[i](
x,
......@@ -1265,7 +1359,7 @@ class Decoder(Module):
logits = logits / math.sqrt(cfg.d_model)
else:
logits = self.logits_dense(output)
return logits, cache
return dict(logits=logits, cache=cache, raw_logits=output)
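The decoder now returns a dictionary rather than a `(logits, cache)` tuple, which is why the tests below switch to dictionary lookups. A hedged migration sketch, assuming `decoder`, `targets`, and `encoded` as in those tests:

```python
# Before this change: logits, cache = decoder(targets, encoded)
outputs = decoder(targets, encoded)
logits = outputs["logits"]          # [batch, length, target_vocab_size]
cache = outputs["cache"]            # per-layer key/value tensors for decoding
raw_logits = outputs["raw_logits"]  # decoder output before the vocab projection
```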
class T5Transformer(Module):
......@@ -1306,33 +1400,72 @@ class T5Transformer(Module):
compute_dtype=self.compute_dtype)
def encode(self,
encoder_input_tokens,
encoder_input_tokens=None,
encoder_segment_ids=None,
encoder_dense_inputs=None,
encoder_dense_segment_ids=None,
training=False):
eligible_positions = tf.cast(
tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
eligible_position_array = []
if encoder_input_tokens is not None:
eligible_position_array.append(
tf.cast(tf.not_equal(encoder_input_tokens, 0), self.compute_dtype))
if encoder_dense_inputs is not None:
eligible_dense_positions = tf.cast(
tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1),
self.compute_dtype)
eligible_position_array.append(eligible_dense_positions)
if not eligible_position_array:
raise ValueError("At least one of encoder_input_tokens and"
" encoder_dense_inputs must be provided.")
eligible_positions = tf.concat(eligible_position_array, axis=1)
encoder_mask = make_attention_mask(
eligible_positions, eligible_positions, dtype=tf.bool)
encoder_segment_id_array = []
if encoder_segment_ids is not None:
encoder_segment_id_array.append(encoder_segment_ids)
if encoder_dense_segment_ids is not None:
encoder_segment_id_array.append(encoder_dense_segment_ids)
if encoder_segment_id_array:
encoder_segment_ids = tf.concat(encoder_segment_id_array, axis=1)
segment_mask = make_attention_mask(
encoder_segment_ids, encoder_segment_ids, tf.equal, dtype=tf.bool)
encoder_mask = tf.math.logical_and(encoder_mask, segment_mask)
encoder_mask = (1.0 - tf.cast(encoder_mask, self.compute_dtype)) * -1e9
return self.encoder(encoder_input_tokens, encoder_mask, training=training)
return self.encoder(
encoder_input_tokens,
encoder_mask,
encoder_dense_inputs,
training=training)
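`encode` treats a word-id position as non-padding when its id is non-zero, and a dense position as non-padding when any feature in its vector is non-zero, then concatenates the two masks before building the self-attention mask. A self-contained sketch of just that step:

```python
import tensorflow as tf

encoder_input_tokens = tf.constant([[2, 2, 1, 0]])               # [batch, len]
encoder_dense_inputs = tf.constant([[[0.5, 0.1], [0.0, 0.0]]])   # [batch, dense_len, dim]

eligible_tokens = tf.cast(tf.not_equal(encoder_input_tokens, 0), tf.float32)
eligible_dense = tf.cast(
    tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1), tf.float32)
eligible_positions = tf.concat([eligible_tokens, eligible_dense], axis=1)
print(eligible_positions.numpy())  # [[1. 1. 1. 0. 1. 0.]]
```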
def decode(
self,
encoded,
decoder_target_tokens,
encoder_input_tokens, # only used for masks
encoder_input_tokens=None, # only used for masks
encoder_dense_inputs=None,
decoder_input_tokens=None,
encoder_segment_ids=None,
encoder_dense_segment_ids=None,
decoder_segment_ids=None,
decode_position=None,
cache=None,
max_decode_len=None,
decode=False,
training=False):
training=False) -> Dict[str, tf.Tensor]:
eligible_inputs_array = []
if encoder_input_tokens is not None:
eligible_inputs = tf.cast(
tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
eligible_inputs_array.append(eligible_inputs)
if encoder_dense_inputs is not None:
eligible_dense_inputs = tf.cast(
tf.reduce_any(tf.not_equal(encoder_dense_inputs, 0), axis=-1),
self.compute_dtype)
eligible_inputs_array.append(eligible_dense_inputs)
eligible_inputs = tf.concat(eligible_inputs_array, axis=1)
if decode:
# For decoding, the decoder_input_tokens is the decoder_target_tokens.
decoder_input_tokens = decoder_target_tokens
......@@ -1342,14 +1475,12 @@ class T5Transformer(Module):
tf.cast(
tf.not_equal(tf.ones_like(decoder_target_tokens), 0),
self.compute_dtype),
tf.cast(tf.not_equal(encoder_input_tokens, 0), self.compute_dtype),
eligible_inputs,
dtype=tf.bool)
else:
# Note that masks should be created using decoder_target_tokens.
eligible_targets = tf.cast(
tf.not_equal(decoder_target_tokens, 0), self.compute_dtype)
eligible_inputs = tf.cast(
tf.not_equal(encoder_input_tokens, 0), self.compute_dtype)
decoder_mask = tf.math.logical_and(
make_attention_mask(
eligible_targets, eligible_targets, dtype=tf.bool),
......@@ -1365,6 +1496,9 @@ class T5Transformer(Module):
decoder_segment_ids,
tf.equal,
dtype=tf.bool))
if encoder_dense_segment_ids is not None:
encoder_segment_ids = tf.concat(
[encoder_segment_ids, encoder_dense_segment_ids], axis=1)
encoder_decoder_mask = tf.math.logical_and(
encoder_decoder_mask,
make_attention_mask(
......@@ -1376,7 +1510,7 @@ class T5Transformer(Module):
decoder_mask = (1.0 - tf.cast(decoder_mask, self.compute_dtype)) * -1e9
encoder_decoder_mask = (
1.0 - tf.cast(encoder_decoder_mask, self.compute_dtype)) * -1e9
logits, cache = self.decoder(
outputs = self.decoder(
decoder_input_tokens,
encoded,
decode_position=decode_position,
......@@ -1386,12 +1520,15 @@ class T5Transformer(Module):
max_decode_len=max_decode_len,
decode=decode,
training=training)
return dict(logits=logits, encoded=encoded, cache=cache)
outputs["encoded"] = encoded
return outputs
@tf.Module.with_name_scope
def __call__(self,
encoder_input_tokens,
decoder_target_tokens,
encoder_input_tokens=None,
decoder_target_tokens=None,
encoder_dense_inputs=None,
encoder_dense_segment_ids=None,
decoder_input_tokens=None,
encoder_segment_ids=None,
decoder_segment_ids=None,
......@@ -1401,9 +1538,12 @@ class T5Transformer(Module):
Args:
encoder_input_tokens: input tokens to the encoder.
decoder_target_tokens: target tokens to the decoder.
encoder_dense_inputs: input dense vectors to the encoder.
encoder_dense_segment_ids: dense input segmentation info for packed
  examples.
decoder_input_tokens: input tokens to the decoder, only required for
  training.
encoder_segment_ids: input segmentation info for packed examples.
decoder_segment_ids: target segmentation info for packed examples.
training: whether it is training pass, affecting dropouts.
......@@ -1411,15 +1551,19 @@ class T5Transformer(Module):
a dictionary of logits/cache.
"""
encoded = self.encode(
encoder_input_tokens,
encoder_input_tokens=encoder_input_tokens,
encoder_segment_ids=encoder_segment_ids,
encoder_dense_inputs=encoder_dense_inputs,
encoder_dense_segment_ids=encoder_dense_segment_ids,
training=training)
outputs = self.decode(
encoded=encoded,
decoder_target_tokens=decoder_target_tokens,
encoder_input_tokens=encoder_input_tokens, # only used for masks.
encoder_dense_inputs=encoder_dense_inputs, # only used for masks.
decoder_input_tokens=decoder_input_tokens,
encoder_segment_ids=encoder_segment_ids,
encoder_dense_segment_ids=encoder_dense_segment_ids,
decoder_segment_ids=decoder_segment_ids,
training=training)
outputs["encoded"] = encoded
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -354,6 +354,40 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
encoded = encoder(tf.zeros((4, 8), dtype=tf.int32))
self.assertEqual(encoded.shape, (4, 8, config.d_model))
@parameterized.named_parameters(("bfloat16", tf.bfloat16),
("float32", tf.float32))
def test_encoder_with_dense(self, dtype):
config = t5.T5TransformerParams(
num_layers=2,
d_model=4,
d_kv=3,
num_heads=4,
d_ff=16,
vocab_size=10,
vocab_embeddings_initializer=tf.keras.initializers.Ones(),
relative_embeddings_initializer=tf.keras.initializers.Ones())
encoder = t5.Encoder(config, compute_dtype=dtype)
encoded = encoder(
tf.zeros((4, 8), dtype=tf.int32),
dense_inputs=tf.ones((4, 2, 4), dtype=dtype))
self.assertEqual(encoded.shape, (4, 10, config.d_model))
@parameterized.named_parameters(("bfloat16", tf.bfloat16),
("float32", tf.float32))
def test_encoder_only_dense(self, dtype):
config = t5.T5TransformerParams(
num_layers=2,
d_model=4,
d_kv=3,
num_heads=4,
d_ff=16,
vocab_size=10,
vocab_embeddings_initializer=tf.keras.initializers.Ones(),
relative_embeddings_initializer=tf.keras.initializers.Ones())
encoder = t5.Encoder(config, compute_dtype=dtype)
encoded = encoder(dense_inputs=tf.ones((4, 2, 4), dtype=dtype))
self.assertEqual(encoded.shape, (4, 2, config.d_model))
def test_decoder(self):
max_decode_len = 10
config = t5.T5TransformerParams(
......@@ -369,7 +403,9 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
batch_size = 4
targets = tf.zeros((4, 8), dtype=tf.int32)
encoded = tf.zeros((4, 8, config.d_model), dtype=tf.float32)
logits, cache = decoder(targets, encoded)
outputs = decoder(targets, encoded)
logits = outputs["logits"]
cache = outputs["cache"]
self.assertEqual(logits.shape, (4, 8, config.vocab_size))
cache = {}
......@@ -378,13 +414,15 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
cache[1] = _create_cache(batch_size, max_decode_len, config.num_heads,
config.d_kv)
targets = tf.zeros((4, 1), dtype=tf.int32)
logits, cache = decoder(
outputs = decoder(
targets,
encoded,
decode_position=2,
cache=cache,
decode=True,
max_decode_len=max_decode_len)
logits = outputs["logits"]
cache = outputs["cache"]
self.assertEqual(logits.shape, (batch_size, 1, config.vocab_size))
for entry in cache.values():
for tensor in entry.values():
......@@ -445,6 +483,180 @@ class T5Test(tf.test.TestCase, parameterized.TestCase):
print(v.name, v.shape)
self.assertEqual(v.dtype, tf.float32)
@parameterized.named_parameters(
("t5_10_dense", ("relu",), True, 26, False, tf.float32),)
def test_transformer_with_dense(self, ffn_activations, logits_via_embedding,
expect_num_variables, layer_sharing, dtype):
max_decode_len = 10
config = t5.T5TransformerParams(
num_layers=1,
d_model=8,
d_kv=4,
num_heads=4,
d_ff=32,
vocab_size=10,
shared_embedding=True,
layer_sharing=layer_sharing,
ffn_activations=ffn_activations,
logits_via_embedding=logits_via_embedding)
transformer = t5.T5Transformer(config, compute_dtype=dtype)
self.assertLen(transformer.trainable_variables, expect_num_variables)
inputs = tf.convert_to_tensor(
np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
segments = tf.convert_to_tensor(
np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
dense_inputs = tf.convert_to_tensor(np.random.randn(2, 2, 8), dtype=dtype)
dense_segments = tf.convert_to_tensor(np.array([[1, 2], [1, 2]]))
outputs = transformer(
encoder_input_tokens=inputs,
encoder_dense_inputs=dense_inputs,
decoder_input_tokens=inputs,
decoder_target_tokens=inputs,
encoder_segment_ids=segments,
encoder_dense_segment_ids=dense_segments,
decoder_segment_ids=segments)
cache = {}
batch_size = 2
cache[0] = _create_cache(
batch_size, max_decode_len, config.num_heads, config.d_kv, dtype=dtype)
outputs = transformer.decode(
encoder_input_tokens=inputs,
encoder_dense_inputs=dense_inputs,
encoded=outputs["encoded"],
decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
decode_position=1,
decode=True,
max_decode_len=max_decode_len,
cache=cache)
self.assertEqual(outputs["logits"].shape,
(batch_size, 1, config.vocab_size))
for v in transformer.trainable_variables:
print(v.name, v.shape)
self.assertEqual(v.dtype, tf.float32)
@parameterized.named_parameters(
("t5_10_dense_layerwise_relpos",
("relu",), True, 26, False, tf.float32, False, 1),
("t5_10_dense_shared_relpos_d2",
("relu",), True, 39, False, tf.float32, True, 2),
("t5_10_dense_layerwise_relpos_d2",
("relu",), True, 40, False, tf.float32, False, 2),
)
def test_transformer_with_lw_relpos(self, ffn_activations,
logits_via_embedding,
expect_num_variables, layer_sharing,
dtype, use_shared_relpos,
num_decoder_layers):
max_decode_len = 10
config = t5.T5TransformerParams(
num_layers=1,
num_decoder_layers=num_decoder_layers,
d_model=8,
d_kv=4,
num_heads=4,
d_ff=32,
vocab_size=10,
shared_embedding=True,
layer_sharing=layer_sharing,
ffn_activations=ffn_activations,
logits_via_embedding=logits_via_embedding,
use_shared_relative_position_bias=use_shared_relpos)
transformer = t5.T5Transformer(config, compute_dtype=dtype)
self.assertLen(transformer.trainable_variables, expect_num_variables)
inputs = tf.convert_to_tensor(
np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
segments = tf.convert_to_tensor(
np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
dense_inputs = tf.convert_to_tensor(np.random.randn(2, 2, 8), dtype=dtype)
dense_segments = tf.convert_to_tensor(np.array([[1, 2], [1, 2]]))
outputs = transformer(
encoder_input_tokens=inputs,
encoder_dense_inputs=dense_inputs,
decoder_input_tokens=inputs,
decoder_target_tokens=inputs,
encoder_segment_ids=segments,
encoder_dense_segment_ids=dense_segments,
decoder_segment_ids=segments)
cache = {}
batch_size = 2
for i in range(num_decoder_layers):
cache[i] = _create_cache(
batch_size,
max_decode_len,
config.num_heads,
config.d_kv,
dtype=dtype)
outputs = transformer.decode(
encoder_input_tokens=inputs,
encoder_dense_inputs=dense_inputs,
encoded=outputs["encoded"],
decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
decode_position=1,
decode=True,
max_decode_len=max_decode_len,
cache=cache)
self.assertEqual(outputs["logits"].shape,
(batch_size, 1, config.vocab_size))
for v in transformer.trainable_variables:
print(v.name, v.shape)
self.assertEqual(v.dtype, tf.float32)
@parameterized.named_parameters(
("t5_10", ("relu",), True, 26, False, tf.float32),)
def test_transformer_with_dense_only(self, ffn_activations,
logits_via_embedding,
expect_num_variables, layer_sharing,
dtype):
max_decode_len = 10
config = t5.T5TransformerParams(
num_layers=1,
d_model=8,
d_kv=4,
num_heads=4,
d_ff=32,
vocab_size=10,
shared_embedding=True,
layer_sharing=layer_sharing,
ffn_activations=ffn_activations,
logits_via_embedding=logits_via_embedding)
transformer = t5.T5Transformer(config, compute_dtype=dtype)
self.assertLen(transformer.trainable_variables, expect_num_variables)
decoder_inputs = tf.convert_to_tensor(
np.array([[2, 2, 1, 3, 1, 0], [3, 3, 1, 2, 2, 1]]))
decoder_segments = tf.convert_to_tensor(
np.array([[1, 1, 1, 2, 2, 0], [1, 1, 1, 2, 2, 2]]))
dense_inputs = tf.convert_to_tensor(np.random.randn(2, 2, 8), dtype=dtype)
dense_segments = tf.convert_to_tensor(np.array([[1, 2], [1, 2]]))
outputs = transformer(
encoder_dense_inputs=dense_inputs,
encoder_dense_segment_ids=dense_segments,
decoder_input_tokens=decoder_inputs,
decoder_target_tokens=decoder_inputs,
decoder_segment_ids=decoder_segments)
cache = {}
batch_size = 2
cache[0] = _create_cache(
batch_size, max_decode_len, config.num_heads, config.d_kv, dtype=dtype)
outputs = transformer.decode(
encoder_dense_inputs=dense_inputs,
encoded=outputs["encoded"],
decoder_target_tokens=tf.ones((batch_size, 1), dtype=tf.int32),
decode_position=1,
decode=True,
max_decode_len=max_decode_len,
cache=cache)
self.assertEqual(outputs["logits"].shape,
(batch_size, 1, config.vocab_size))
for v in transformer.trainable_variables:
print(v.name, v.shape)
self.assertEqual(v.dtype, tf.float32)
@parameterized.named_parameters(
("t5_10", ("relu",), True, 39, tf.float32, 2),
("t5_10_bfloat16", ("relu",), True, 39, tf.bfloat16, 2))
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -37,3 +37,8 @@ Generalized Autoregressive Pretraining for Language Understanding"
(https://arxiv.org/abs/1906.08237). It includes embedding lookups,
relative position encodings, mask computations, segment matrix computations and
Transformer XL layers using one or two stream relative self-attention.
* [`FNet`](fnet.py) implements the encoder model from ["FNet: Mixing Tokens with
Fourier Transforms"](https://aclanthology.org/2022.naacl-main.319/). FNet has
the same structure as a Transformer encoder, except that all or most of the
self-attention sublayers are replaced with Fourier sublayers.
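For context, the mixing step described in the FNet paper is a 2D discrete Fourier transform over the sequence and hidden dimensions, keeping only the real part. A minimal sketch of that sublayer (not the exact `FNet` implementation in this repository):

```python
import tensorflow as tf


def fourier_mixing(x: tf.Tensor) -> tf.Tensor:
  """Replaces self-attention with token mixing via a 2D FFT (real part only)."""
  return tf.math.real(tf.signal.fft2d(tf.cast(x, tf.complex64)))


x = tf.random.normal([2, 8, 16])  # [batch, seq_len, hidden]
mixed = fourier_mixing(x)
print(mixed.shape)  # (2, 8, 16)
```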
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -23,6 +23,7 @@ from official.nlp.modeling.networks.bert_encoder import BertEncoder
from official.nlp.modeling.networks.bert_encoder import BertEncoderV2
from official.nlp.modeling.networks.classification import Classification
from official.nlp.modeling.networks.encoder_scaffold import EncoderScaffold
from official.nlp.modeling.networks.fnet import FNet
from official.nlp.modeling.networks.funnel_transformer import FunnelTransformerEncoder
from official.nlp.modeling.networks.mobile_bert_encoder import MobileBERTEncoder
from official.nlp.modeling.networks.packed_sequence_embedding import PackedSequenceEmbedding
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,6 +18,7 @@ import collections
import tensorflow as tf
from official.modeling import activations
from official.modeling import tf_utils
from official.nlp.modeling import layers
......@@ -92,13 +93,13 @@ class AlbertEncoder(tf.keras.Model):
embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
initializer=tf_utils.clone_initializer(initializer),
name='word_embeddings')
word_embeddings = embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
initializer=tf_utils.clone_initializer(initializer),
max_length=max_sequence_length,
name='position_embedding')
position_embeddings = position_embedding_layer(word_embeddings)
......@@ -107,7 +108,7 @@ class AlbertEncoder(tf.keras.Model):
layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
initializer=tf_utils.clone_initializer(initializer),
use_one_hot=True,
name='type_embeddings')(type_ids))
......@@ -123,11 +124,11 @@ class AlbertEncoder(tf.keras.Model):
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
if embedding_width != hidden_size:
embeddings = tf.keras.layers.experimental.EinsumDense(
embeddings = tf.keras.layers.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
kernel_initializer=tf_utils.clone_initializer(initializer),
name='embedding_projection')(
embeddings)
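The deprecated `tf.keras.layers.experimental.EinsumDense` alias is swapped for `tf.keras.layers.EinsumDense`; the `'...x,xy->...y'` equation projects the embedding width up to the hidden size. A standalone check with hypothetical ALBERT-style sizes:

```python
import tensorflow as tf

embedding_width, hidden_size = 128, 768  # hypothetical sizes
embeddings = tf.random.normal([2, 16, embedding_width])  # [batch, seq, width]

projection = tf.keras.layers.EinsumDense(
    '...x,xy->...y',
    output_shape=hidden_size,
    bias_axes='y',
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    name='embedding_projection')
print(projection(embeddings).shape)  # (2, 16, 768)
```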
......@@ -139,7 +140,7 @@ class AlbertEncoder(tf.keras.Model):
inner_activation=activation,
output_dropout=dropout_rate,
attention_dropout=attention_dropout_rate,
kernel_initializer=initializer,
kernel_initializer=tf_utils.clone_initializer(initializer),
name='transformer')
encoder_outputs = []
for _ in range(num_layers):
......@@ -153,7 +154,7 @@ class AlbertEncoder(tf.keras.Model):
cls_output = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
kernel_initializer=tf_utils.clone_initializer(initializer),
name='pooler_transform')(
first_token_tensor)
if dict_outputs:
......@@ -172,7 +173,7 @@ class AlbertEncoder(tf.keras.Model):
# created using the Functional API. Once super().__init__ is called, we
# can assign attributes to `self` - note that all `self` assignments are
# below this line.
super(AlbertEncoder, self).__init__(
super().__init__(
inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
config_dict = {
'vocab_size': vocab_size,
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......