"vscode:/vscode.git/clone" did not exist on "53787c69eb37877cba025323aa71fdd30aebaea7"
Commit 32e4ca51 authored by qianyj

Update code to v2.11.0

parents 9485aa1d 71060f67
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -20,8 +20,7 @@ examples.
* [`losses`](losses) contains common loss computation used in NLP tasks.
Please see the colab
[nlp_modeling_library_intro.ipynb]
(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb)
[NLP modeling library intro.ipynb](https://colab.sandbox.google.com/github/tensorflow/models/blob/master/docs/nlp/index.ipynb)
for how to build transformer-based NLP models using the above primitives.
Besides the pre-defined primitives, it also provides scaffold classes to allow
......@@ -44,8 +43,7 @@ custom hidden layer (which will replace the Transformer instantiation in the
encoder).
Please see the colab
[customize_encoder.ipynb]
(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb)
[customize_encoder.ipynb](https://colab.sandbox.google.com/github/tensorflow/models/blob/master/docs/nlp/customize_encoder.ipynb)
for how to use scaffold classes to build novel architectures.
BERT and ALBERT models in this repo are implemented using this library.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -13,7 +13,7 @@ assemble new `tf.keras` layers or models.
["Big Bird: Transformers for Longer Sequences"](https://arxiv.org/abs/2007.14062).
* [CachedAttention](attention.py) implements an attention layer with cache
used for auto-agressive decoding.
used for autoregressive decoding.
* [KernelAttention](kernel_attention.py) implements a group of attention
mechanisms that express self-attention as a linear dot-product of
......
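For intuition, here is a minimal, hypothetical sketch of the linearization idea (not the repo's KernelAttention API): with a positive feature map phi, softmax attention is approximated by phi(Q)(phi(K)^T V), which is linear rather than quadratic in sequence length.
import tensorflow as tf
def linear_attention(q, k, v):
  """Toy kernelized attention, O(seq_len) instead of O(seq_len**2).
  q, k: [batch, seq, dim]; v: [batch, seq, dim_v].
  Uses the elu(x) + 1 feature map; any positive feature map works.
  """
  phi_q = tf.nn.elu(q) + 1.0
  phi_k = tf.nn.elu(k) + 1.0
  kv = tf.einsum("bsd,bse->bde", phi_k, v)  # sum over keys computed once
  normalizer = tf.einsum("bsd,bd->bs", phi_q, tf.reduce_sum(phi_k, axis=1))
  return tf.einsum("bsd,bde->bse", phi_q, kv) / normalizer[..., None]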
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -20,7 +20,9 @@ They can be used to assemble new `tf.keras` layers or models.
from official.nlp.modeling.layers.attention import *
from official.nlp.modeling.layers.bigbird_attention import BigBirdAttention
from official.nlp.modeling.layers.bigbird_attention import BigBirdMasks
from official.nlp.modeling.layers.block_diag_feedforward import BlockDiagFeedforward
from official.nlp.modeling.layers.cls_head import *
from official.nlp.modeling.layers.factorized_embedding import FactorizedEmbedding
from official.nlp.modeling.layers.gated_feedforward import GatedFeedforward
from official.nlp.modeling.layers.gaussian_process import RandomFeatureGaussianProcess
from official.nlp.modeling.layers.kernel_attention import KernelAttention
......@@ -28,11 +30,19 @@ from official.nlp.modeling.layers.kernel_attention import KernelMask
from official.nlp.modeling.layers.masked_lm import MaskedLM
from official.nlp.modeling.layers.masked_softmax import MaskedSoftmax
from official.nlp.modeling.layers.mat_mul_with_margin import MatMulWithMargin
from official.nlp.modeling.layers.mixing import FourierTransformLayer
from official.nlp.modeling.layers.mixing import HartleyTransformLayer
from official.nlp.modeling.layers.mixing import LinearTransformLayer
from official.nlp.modeling.layers.mixing import MixingMechanism
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertEmbedding
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertMaskedLM
from official.nlp.modeling.layers.mobile_bert_layers import MobileBertTransformer
from official.nlp.modeling.layers.multi_channel_attention import *
from official.nlp.modeling.layers.on_device_embedding import OnDeviceEmbedding
from official.nlp.modeling.layers.pack_optimization import PackBertEmbeddings
from official.nlp.modeling.layers.pack_optimization import StridedTransformerEncoderBlock
from official.nlp.modeling.layers.pack_optimization import StridedTransformerScaffold
from official.nlp.modeling.layers.per_dim_scale_attention import PerDimScaleAttention
from official.nlp.modeling.layers.position_embedding import PositionEmbedding
from official.nlp.modeling.layers.position_embedding import RelativePositionBias
from official.nlp.modeling.layers.position_embedding import RelativePositionEmbedding
......@@ -41,6 +51,7 @@ from official.nlp.modeling.layers.relative_attention import TwoStreamRelativeAtt
from official.nlp.modeling.layers.reuse_attention import ReuseMultiHeadAttention
from official.nlp.modeling.layers.reuse_transformer import ReuseTransformer
from official.nlp.modeling.layers.rezero_transformer import ReZeroTransformer
from official.nlp.modeling.layers.routing import *
from official.nlp.modeling.layers.self_attention_mask import SelfAttentionMask
from official.nlp.modeling.layers.spectral_normalization import *
from official.nlp.modeling.layers.talking_heads_attention import TalkingHeadsAttention
......@@ -49,7 +60,8 @@ from official.nlp.modeling.layers.text_layers import BertTokenizer
from official.nlp.modeling.layers.text_layers import FastWordpieceBertTokenizer
from official.nlp.modeling.layers.text_layers import SentencepieceTokenizer
from official.nlp.modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense
from official.nlp.modeling.layers.transformer import *
from official.nlp.modeling.layers.transformer import Transformer
from official.nlp.modeling.layers.transformer import TransformerDecoderBlock
from official.nlp.modeling.layers.transformer_encoder_block import TransformerEncoderBlock
from official.nlp.modeling.layers.transformer_scaffold import TransformerScaffold
from official.nlp.modeling.layers.transformer_xl import TransformerXL
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,13 +18,13 @@ import math
import tensorflow as tf
EinsumDense = tf.keras.layers.experimental.EinsumDense
EinsumDense = tf.keras.layers.EinsumDense
MultiHeadAttention = tf.keras.layers.MultiHeadAttention
@tf.keras.utils.register_keras_serializable(package="Text")
class CachedAttention(tf.keras.layers.MultiHeadAttention):
"""Attention layer with cache used for auto-agressive decoding.
"""Attention layer with cache used for autoregressive decoding.
Arguments are the same as `tf.keras.layers.MultiHeadAttention` layer.
"""
......
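For intuition, a hypothetical sketch of why the cache helps autoregressive decoding (illustrative names, not the layer's exact API): each step appends its new key/value projections so earlier positions are never recomputed.
import tensorflow as tf
def append_to_cache(cache, new_key, new_value):
  # cache["key"]/cache["value"]: [batch, steps_so_far, num_heads, head_dim];
  # new_key/new_value: [batch, 1, num_heads, head_dim] for the current step.
  cache["key"] = tf.concat([cache["key"], new_key], axis=1)
  cache["value"] = tf.concat([cache["value"], new_value], axis=1)
  return cache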
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based gated feedforward layer."""
# pylint: disable=g-classes-have-attributes
from typing import Optional
import tensorflow as tf
from official.modeling import tf_utils
class BlockDiagFeedforward(tf.keras.layers.Layer):
"""Block diagonal feedforward layer.
This layer replaces the weight matrix of the output_dense layer with a block
diagonal matrix to save parameters and FLOPs. A linear mixing layer can
optionally be added to improve the layer's expressivity.
Args:
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
dropout: Dropout probability for the output dropout.
num_blocks: The number of blocks for the block diagonal matrix of the
output_dense layer.
apply_mixing: Apply linear mixing if True.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer biases.
"""
def __init__(
self,
intermediate_size: int,
intermediate_activation: str,
dropout: float,
num_blocks: int = 1,
apply_mixing: bool = True,
kernel_initializer: str = "glorot_uniform",
bias_initializer: str = "zeros",
kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
activity_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
kernel_constraint: Optional[tf.keras.constraints.Constraint] = None,
bias_constraint: Optional[tf.keras.constraints.Constraint] = None,
**kwargs): # pylint: disable=g-doc-args
super().__init__(**kwargs)
self._intermediate_size = intermediate_size
self._intermediate_activation = intermediate_activation
self._dropout = dropout
self._num_blocks = num_blocks
self._apply_mixing = apply_mixing
if intermediate_size % num_blocks != 0:
raise ValueError("Intermediate_size (%d) isn't a multiple of num_blocks "
"(%d)." % (intermediate_size, num_blocks))
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
def build(self, input_shape):
hidden_size = input_shape.as_list()[-1]
common_kwargs = dict(
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
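# Einsum "abc,cde->abde": a=batch, b=sequence, c=hidden size, d=num_blocks,
# e=intermediate size per block; the full hidden vector feeds every block.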
self._intermediate_dense = tf.keras.layers.EinsumDense(
"abc,cde->abde",
output_shape=(None, self._num_blocks,
self._intermediate_size // self._num_blocks),
bias_axes="de",
name="intermediate",
kernel_initializer=tf_utils.clone_initializer(self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(self._bias_initializer),
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
policy = tf.float32
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._intermediate_activation, dtype=policy)
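# Einsum "abde,deo->abdo": each block d maps its own slice e to its own
# output slice o; the stacked per-block kernels form a block diagonal matrix.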
self._output_dense = tf.keras.layers.EinsumDense(
"abde,deo->abdo",
output_shape=(None, self._num_blocks, hidden_size // self._num_blocks),
bias_axes="do",
name="output",
kernel_initializer=tf_utils.clone_initializer(self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(self._bias_initializer),
**common_kwargs)
if self._apply_mixing:
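# Einsum "abdo,de->abeo": linearly mixes information across blocks (d -> e).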
self._output_mixing = tf.keras.layers.EinsumDense(
"abdo,de->abeo",
output_shape=(None, self._num_blocks,
hidden_size // self._num_blocks),
name="output_mixing",
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(self._bias_initializer),
**common_kwargs)
self._output_reshape = tf.keras.layers.Reshape((-1, hidden_size))
self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout)
def get_config(self):
config = {
"intermediate_size":
self._intermediate_size,
"intermediate_activation":
self._intermediate_activation,
"dropout":
self._dropout,
"num_blocks":
self._num_blocks,
"apply_mixing":
self._apply_mixing,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint)
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
intermediate_output = self._intermediate_dense(inputs)
intermediate_output = self._intermediate_activation_layer(
intermediate_output)
layer_output = self._output_dense(intermediate_output)
if self._apply_mixing:
layer_output = self._output_mixing(layer_output)
layer_output = self._output_reshape(layer_output)
layer_output = self._output_dropout(layer_output)
return layer_output
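A minimal usage sketch (sizes are illustrative): only the output projection's kernel is block diagonal, so with num_blocks=4 that kernel holds roughly a quarter of the parameters of a dense equivalent.
import tensorflow as tf
from official.nlp.modeling.layers import block_diag_feedforward
layer = block_diag_feedforward.BlockDiagFeedforward(
    intermediate_size=3072,
    intermediate_activation="gelu",
    dropout=0.1,
    num_blocks=4,
    apply_mixing=True)
inputs = tf.keras.Input(shape=(128, 768))  # [batch, seq_len, hidden]
outputs = layer(inputs)                    # same shape: [batch, 128, 768]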
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based gated feedforward layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.modeling.layers import block_diag_feedforward
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BlockDiagFeedforwardTest(keras_parameterized.TestCase):
def tearDown(self):
super(BlockDiagFeedforwardTest, self).tearDown()
tf.keras.mixed_precision.set_global_policy("float32")
@parameterized.parameters(
(1, True, "float32"),
(1, True, "mixed_float16"),
(1, False, "float32"),
(1, False, "mixed_float16"),
(2, True, "float32"),
(2, True, "mixed_float16"),
(2, False, "float32"),
(2, False, "mixed_float16"),
)
def test_layer_creation(self, num_blocks, apply_mixing, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
dropout=0.1,
num_blocks=num_blocks,
apply_mixing=apply_mixing,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
sequence_length = 64
width = 128
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# The output should have the same shape as the input.
self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())
@parameterized.parameters(
(1, True, "float32"),
(1, True, "mixed_float16"),
(1, False, "float32"),
(1, False, "mixed_float16"),
(2, True, "float32"),
(2, True, "mixed_float16"),
(2, False, "float32"),
(2, False, "mixed_float16"),
)
def test_layer_invocation(self, num_blocks, apply_mixing, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
dropout=0.1,
num_blocks=num_blocks,
apply_mixing=apply_mixing,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
sequence_length = 16
width = 32
# Create a 3-dimensional input (the first dimension is implicit).
data_tensor = tf.keras.Input(shape=(sequence_length, width))
output_tensor = test_layer(data_tensor)
# Create a model from the test layer.
model = tf.keras.Model(data_tensor, output_tensor)
# Invoke the model on test data.
batch_size = 6
input_data = 10 * np.random.random_sample(
(batch_size, sequence_length, width))
output_data = model.predict(input_data)
self.assertEqual(output_data.shape, (batch_size, sequence_length, width))
def test_get_config(self):
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
dropout=0.1,
num_blocks=2,
apply_mixing=True,
kernel_initializer="glorot_uniform",
bias_initializer="zeros")
test_layer = block_diag_feedforward.BlockDiagFeedforward(**kwargs)
new_layer = block_diag_feedforward.BlockDiagFeedforward.from_config(
test_layer.get_config())
self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -57,12 +57,14 @@ class ClassificationHead(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(
units=self.inner_dim,
activation=self.activation,
kernel_initializer=self.initializer,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="pooler_dense")
self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.out_proj = tf.keras.layers.Dense(
units=num_classes, kernel_initializer=self.initializer, name="logits")
units=num_classes,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="logits")
def call(self, features: tf.Tensor, only_project: bool = False):
"""Implements call().
......@@ -146,14 +148,15 @@ class MultiClsHeads(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(
units=inner_dim,
activation=self.activation,
kernel_initializer=self.initializer,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="pooler_dense")
self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
self.out_projs = []
for name, num_classes in cls_list:
self.out_projs.append(
tf.keras.layers.Dense(
units=num_classes, kernel_initializer=self.initializer,
units=num_classes,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name=name))
def call(self, features: tf.Tensor, only_project: bool = False):
......@@ -277,7 +280,7 @@ class GaussianProcessClassificationHead(ClassificationHead):
if use_gp_layer:
self.out_proj = gaussian_process.RandomFeatureGaussianProcess(
self.num_classes,
kernel_initializer=self.initializer,
kernel_initializer=tf_utils.clone_initializer(self.initializer),
name="logits",
**self.gp_layer_kwargs)
......@@ -361,3 +364,97 @@ def extract_spec_norm_kwargs(kwargs):
return dict(
iteration=kwargs.pop("iteration", 1),
norm_multiplier=kwargs.pop("norm_multiplier", .99))
class PerQueryDenseHead(tf.keras.layers.Layer):
"""Pooling head used for EncT5 style models.
This module applies a different projection to each query.
For an input of shape [bs, num_queries, hidden_size], it projects each query
down to `features` outputs, ending up with shape [bs, num_queries, features].
For example, for classification with a few classes, one may set num_queries
to 1 and features to the number of classes. For multilabel classification, one
may set num_queries to the number of classes and features to 2, so that each
query represents a binary classification for one label.
"""
def __init__(self,
num_queries: int,
features: int,
use_bias: bool = False,
kernel_initializer: str = "glorot_uniform",
**kwargs):
"""Initializes the `PerQueryDenseHead`.
Args:
num_queries: number of queries (the learnable embeddings in the input
sequences) from the decoder.
features: int, the number of output features. Each query will be
projected to this size with its own projection.
use_bias: whether to add a bias to the output.
kernel_initializer: Initializer for dense layer kernels.
**kwargs: Keyword arguments.
"""
super().__init__(**kwargs)
self.num_queries = num_queries
self.features = features
self.use_bias = use_bias
self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
def build(self, input_shape):
input_shape = tf.TensorShape(input_shape)
# Hidden size.
last_dim = tf.compat.dimension_value(input_shape[-1])
self.hidden_size = last_dim
self.kernel = self.add_weight(
"kernel",
shape=[self.num_queries, last_dim, self.features],
initializer=self.kernel_initializer,
dtype=self.dtype,
trainable=True)
if self.use_bias:
self.bias = self.add_weight(
"bias",
shape=[
self.num_queries,
self.features,
],
dtype=self.dtype,
trainable=True)
else:
self.bias = None
def call(self, inputs: tf.Tensor) -> tf.Tensor:
"""Implements call().
Args:
inputs: a rank-3 Tensor of shape= [bs, num_queries, hidden_size].
Returns:
A Tensor, shape= [batch size, num_queries, features].
"""
outputs = tf.einsum("bqh,qhf->bqf", inputs, self.kernel)
if self.use_bias:
outputs += self.bias
return outputs
def get_config(self):
config = {
"num_queries":
self.num_queries,
"features":
self.features,
"kernel_initializer":
tf.keras.initializers.serialize(self.kernel_initializer),
}
config.update(super(PerQueryDenseHead, self).get_config())
return config
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
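A usage sketch for the multilabel case the docstring describes (sizes are illustrative):
import tensorflow as tf
from official.nlp.modeling.layers import cls_head
num_labels = 10
head = cls_head.PerQueryDenseHead(num_queries=num_labels, features=2)
decoder_out = tf.zeros([8, num_labels, 512])  # [batch, num_queries, hidden]
logits = head(decoder_out)  # [8, 10, 2]: binary logits for each label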
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -199,5 +199,29 @@ class GaussianProcessClassificationHead(tf.test.TestCase,
self.assertEqual(layer_config["norm_multiplier"], 1.)
self.assertEqual(layer_config["num_inducing"], 512)
class PerQueryDenseHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(("single_query", 1, 3, False),
("multi_queries", 10, 2, False),
("with_bias", 10, 2, True))
def test_layer_invocation(self, num_queries, features, use_bias):
batch_size = 5
hidden_size = 10
layer = cls_head.PerQueryDenseHead(
num_queries=num_queries, features=features, use_bias=use_bias)
inputs = tf.zeros(
shape=(batch_size, num_queries, hidden_size), dtype=tf.float32)
outputs = layer(inputs)
self.assertEqual(outputs.shape, [batch_size, num_queries, features])
def test_layer_serialization(self):
layer = cls_head.PerQueryDenseHead(
num_queries=10, features=2, use_bias=True)
new_layer = cls_head.PerQueryDenseHead.from_config(layer.get_config())
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(layer.get_config(), new_layer.get_config())
if __name__ == "__main__":
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A factorized embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling.layers import on_device_embedding
@tf.keras.utils.register_keras_serializable(package='Text')
class FactorizedEmbedding(on_device_embedding.OnDeviceEmbedding):
"""A factorized embeddings layer for supporting larger embeddings.
Arguments:
vocab_size: Number of elements in the vocabulary.
embedding_width: Width of word embeddings.
output_dim: The output dimension of this layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
lookup. Defaults to False (that is, using tf.gather). Setting this option
to True may improve performance, especially on small vocabulary sizes, but
will generally require more memory.
scale_factor: Whether to scale the output embeddings. Defaults to None (that
is, no scaling). Setting this option to a float multiplies the output
embeddings by scale_factor.
"""
def __init__(self,
vocab_size: int,
embedding_width: int,
output_dim: int,
initializer='glorot_uniform',
use_one_hot=False,
scale_factor=None,
**kwargs):
super().__init__(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=use_one_hot,
scale_factor=scale_factor,
**kwargs)
self._output_dim = output_dim
def get_config(self):
config = {'output_dim': self._output_dim}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self._embedding_projection = tf.keras.layers.EinsumDense(
'...x,xy->...y',
output_shape=self._output_dim,
bias_axes=None,
kernel_initializer=tf_utils.clone_initializer(self._initializer),
name='embedding_projection')
super().build(input_shape)
def call(self, inputs):
output = super().call(inputs)
return self._embedding_projection(output)
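This is the ALBERT-style factorization: a vocab_size x embedding_width lookup followed by an embedding_width x output_dim projection, storing V*E + E*H parameters instead of V*H. A rough sketch with illustrative sizes:
from official.nlp.modeling.layers import factorized_embedding
emb = factorized_embedding.FactorizedEmbedding(
    vocab_size=30000, embedding_width=128, output_dim=768)
# Parameters: 30000*128 + 128*768 (~3.9M) versus 30000*768 (~23M) unfactorized.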
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for FactorizedEmbedding layer."""
import numpy as np
import tensorflow as tf
from official.nlp.modeling.layers import factorized_embedding
class FactorizedEmbeddingTest(tf.test.TestCase):
def test_layer_creation(self):
vocab_size = 31
embedding_width = 27
output_dim = 45
test_layer = factorized_embedding.FactorizedEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
output_dim=output_dim)
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
input_tensor = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
output_tensor = test_layer(input_tensor)
# The output should be the same as the input, save that it has an extra
# output_dim dimension on the end.
expected_output_shape = [None, sequence_length, output_dim]
self.assertEqual(expected_output_shape, output_tensor.shape.as_list())
self.assertEqual(output_tensor.dtype, tf.float32)
def test_layer_invocation(self):
vocab_size = 31
embedding_width = 27
output_dim = 45
test_layer = factorized_embedding.FactorizedEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
output_dim=output_dim)
# Create a 2-dimensional input (the first dimension is implicit).
sequence_length = 23
input_tensor = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
output_tensor = test_layer(input_tensor)
# Create a model from the test layer.
model = tf.keras.Model(input_tensor, output_tensor)
# Invoke the model on test data. We can't validate the output data itself
# (the NN is too complex) but this will rule out structural runtime errors.
batch_size = 3
input_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
output = model.predict(input_data)
self.assertEqual(tf.float32, output.dtype)
if __name__ == "__main__":
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,6 +18,9 @@
import gin
import tensorflow as tf
from official.modeling import tf_utils
from official.nlp.modeling.layers import util
@tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable
......@@ -55,9 +58,9 @@ class GatedFeedforward(tf.keras.layers.Layer):
"""
def __init__(self,
intermediate_size,
intermediate_activation,
dropout,
inner_dim=768,
inner_activation=tf_utils.get_activation("gelu"),
dropout=0.0,
use_gate=True,
apply_output_layer_norm=True,
num_blocks=1,
......@@ -70,9 +73,12 @@ class GatedFeedforward(tf.keras.layers.Layer):
kernel_constraint=None,
bias_constraint=None,
**kwargs):
super(GatedFeedforward, self).__init__(**kwargs)
self._intermediate_size = intermediate_size
self._intermediate_activation = intermediate_activation
inner_dim = kwargs.pop("intermediate_size", inner_dim)
inner_activation = kwargs.pop("intermediate_activation", inner_activation)
util.filter_kwargs(kwargs)
super().__init__(**kwargs)
self._inner_dim = inner_dim
self._inner_activation = inner_activation
self._dropout = dropout
self._use_gate = use_gate
self._num_blocks = num_blocks
......@@ -95,15 +101,13 @@ class GatedFeedforward(tf.keras.layers.Layer):
hidden_size = input_shape.as_list()[-1]
common_kwargs = dict(
kernel_initializer=self._kernel_initializer,
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
self._intermediate_dense = []
self._intermediate_activation_layers = []
self._inner_activation_layers = []
self._gate_dense = []
self._output_dense = []
self._output_dropout = []
......@@ -116,29 +120,41 @@ class GatedFeedforward(tf.keras.layers.Layer):
activation_policy = tf.float32
for i in range(self._num_blocks):
self._intermediate_dense.append(
tf.keras.layers.experimental.EinsumDense(
tf.keras.layers.EinsumDense(
"abc,cd->abd",
output_shape=(None, self._intermediate_size),
output_shape=(None, self._inner_dim),
bias_axes="d",
name="intermediate_%d" % i,
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(
self._bias_initializer),
**common_kwargs))
self._intermediate_activation_layers.append(
self._inner_activation_layers.append(
tf.keras.layers.Activation(
self._intermediate_activation, dtype=activation_policy))
self._inner_activation, dtype=activation_policy))
if self._use_gate:
self._gate_dense.append(
tf.keras.layers.experimental.EinsumDense(
tf.keras.layers.EinsumDense(
"abc,cd->abd",
output_shape=(None, self._intermediate_size),
output_shape=(None, self._inner_dim),
bias_axes="d",
name="gate_%d" % i,
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(
self._bias_initializer),
**common_kwargs))
self._output_dense.append(
tf.keras.layers.experimental.EinsumDense(
tf.keras.layers.EinsumDense(
"abc,cd->abd",
output_shape=(None, hidden_size),
bias_axes="d",
name="output_%d" % i,
kernel_initializer=tf_utils.clone_initializer(
self._kernel_initializer),
bias_initializer=tf_utils.clone_initializer(
self._bias_initializer),
**common_kwargs))
self._output_dropout.append(tf.keras.layers.Dropout(rate=self._dropout))
# Use float32 in layernorm for numeric stability.
......@@ -152,10 +168,10 @@ class GatedFeedforward(tf.keras.layers.Layer):
def get_config(self):
config = {
"intermediate_size":
self._intermediate_size,
"intermediate_activation":
self._intermediate_activation,
"inner_dim":
self._inner_dim,
"inner_activation":
self._inner_activation,
"dropout":
self._dropout,
"use_gate":
......@@ -179,7 +195,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint)
}
base_config = super(GatedFeedforward, self).get_config()
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
......@@ -187,7 +203,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
for i in range(self._num_blocks):
layer_input = layer_output
intermediate_output = self._intermediate_dense[i](layer_input)
intermediate_output = self._intermediate_activation_layers[i](
intermediate_output = self._inner_activation_layers[i](
intermediate_output)
if self._use_gate:
gated_linear = self._gate_dense[i](layer_input)
......
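The constructor change above also shows the backward-compatibility pattern used for the rename: legacy keyword names are popped from **kwargs and used as defaults before the remaining kwargs reach the base class. A generic sketch of the pattern (MyLayer is hypothetical):
import tensorflow as tf
class MyLayer(tf.keras.layers.Layer):
  def __init__(self, inner_dim=768, **kwargs):
    # Accept the pre-rename argument name as an alias for inner_dim.
    inner_dim = kwargs.pop("intermediate_size", inner_dim)
    super().__init__(**kwargs)
    self._inner_dim = inner_dim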
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -44,8 +44,8 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
def test_layer_creation(self, use_gate, num_blocks, dropout_position, dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=128,
intermediate_activation="relu",
inner_dim=128,
inner_activation="relu",
dropout=0.1,
use_gate=use_gate,
num_blocks=num_blocks,
......@@ -76,8 +76,8 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
dtype):
tf.keras.mixed_precision.set_global_policy(dtype)
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
inner_dim=16,
inner_activation="relu",
dropout=0.1,
use_gate=use_gate,
num_blocks=num_blocks,
......@@ -104,8 +104,8 @@ class GatedFeedforwardTest(keras_parameterized.TestCase):
def test_serialize_deserialize(self):
kwargs = dict(
intermediate_size=16,
intermediate_activation="relu",
inner_dim=16,
inner_activation="relu",
dropout=0.1,
use_gate=False,
num_blocks=4,
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Definitions for random feature Gaussian process layer."""
import math
import tensorflow as tf
......@@ -117,7 +116,7 @@ class RandomFeatureGaussianProcess(tf.keras.layers.Layer):
name: (string) Layer name.
**gp_output_kwargs: Additional keyword arguments to dense output layer.
"""
super(RandomFeatureGaussianProcess, self).__init__(name=name, dtype=dtype)
super().__init__(name=name, dtype=dtype)
self.units = units
self.num_inducing = num_inducing
......@@ -227,7 +226,7 @@ class RandomFeatureGaussianProcess(tf.keras.layers.Layer):
"""Resets covariance matrix of the GP layer.
This function is useful for resetting the model's covariance matrix at the
begining of a new epoch.
beginning of a new epoch.
"""
self._gp_cov_layer.reset_precision_matrix()
......@@ -381,7 +380,7 @@ class LaplaceRandomFeatureCovariance(tf.keras.layers.Layer):
"""Resets precision matrix to its initial value.
This function is useful for resetting the model's covariance matrix at the
begining of a new epoch.
beginning of a new epoch.
"""
precision_matrix_reset_op = self.precision_matrix.assign(
self.initial_precision_matrix)
......
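A hedged sketch of wiring this reset into training, assuming the public method is named reset_covariance_matrix() as the docstring suggests (the callback class here is illustrative):
import tensorflow as tf
class ResetCovarianceCallback(tf.keras.callbacks.Callback):
  """Resets the GP layer's covariance estimate at the start of each epoch."""
  def __init__(self, gp_layer):
    super().__init__()
    self._gp_layer = gp_layer
  def on_epoch_begin(self, epoch, logs=None):
    self._gp_layer.reset_covariance_matrix()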
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Tests for Gaussian process functions."""
import os
import shutil
......