Unverified Commit 415e9a09 authored by Matt, committed by GitHub

Add tf_keras imports to prepare for Keras 3 (#28588)

* Port core files + ESM (because ESM code is odd)

* Search-replace in modelling code

* Fix up transfo_xl as well

* Fix other core files + tests (still need to add correct import to tests)

* Fix cookiecutter

* make fixup, fix imports in some more core files

* Auto-add imports to tests

* Cleanup, add imports to sagemaker tests

* Use correct exception for importing tf_keras

* Fixes in modeling_tf_utils

* make fixup

* Correct version parsing code

* Ensure the pipeline tests correctly revert to float32 after each test

* Ensure the pipeline tests correctly revert to float32 after each test

* More tf.keras -> keras

* Add dtype cast

* Better imports of tf_keras

* Add a cast for tf.assign, just in case

* Fix callback imports
parent 1d489b3e
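
The recurring change across the files below is a guarded import: prefer the backwards-compatible `tf-keras` package, fall back to the installed `keras`, and fail fast if that turns out to be Keras 3. A minimal, self-contained sketch of the pattern as it appears throughout the diff (nothing beyond the diff itself is assumed):

```py
from packaging.version import parse

try:
    # Prefer the package that keeps shipping the Keras 2 API alongside Keras 3
    import tf_keras as keras
except (ModuleNotFoundError, ImportError):
    import keras

    # Fallback path: refuse to continue if the installed keras is Keras 3
    if parse(keras.__version__).major > 2:
        raise ValueError(
            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
            "Transformers. Please install the backwards-compatible tf-keras package with "
            "`pip install tf-keras`."
        )
```

Modules that already depend on `transformers.modeling_tf_utils` reuse its `keras` symbol (`from transformers.modeling_tf_utils import keras`) instead of repeating this block.
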
......@@ -69,7 +69,6 @@ TensorFlow's [model.save](https://www.tensorflow.org/tutorials/keras/save_and_lo
```py
>>> from transformers import TFPreTrainedModel
>>> from tensorflow import keras
>>> model.save_weights("some_folder/tf_model.h5")
>>> model = TFPreTrainedModel.from_pretrained("some_folder")
......
......@@ -47,6 +47,7 @@ from transformers import (
set_seed,
)
from transformers.keras_callbacks import KerasMetricCallback
from transformers.modeling_tf_utils import keras
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
......@@ -363,7 +364,7 @@ def main():
def _train_transforms(image):
img_size = image_size
image = tf.keras.utils.img_to_array(image)
image = keras.utils.img_to_array(image)
image = random_resized_crop(image, size=img_size)
image = tf.image.random_flip_left_right(image)
image /= 255.0
......@@ -372,7 +373,7 @@ def main():
return image
def _val_transforms(image):
image = tf.keras.utils.img_to_array(image)
image = keras.utils.img_to_array(image)
image = tf.image.resize(image, size=image_size)
# image = np.array(image) # FIXME - use tf.image function
image = center_crop(image, size=image_size)
......
......@@ -22,6 +22,7 @@ import os
import re
import tensorflow as tf
from packaging.version import parse
from transformers import (
AutoConfig,
......@@ -33,6 +34,19 @@ from transformers import (
)
try:
import tf_keras as keras
except (ModuleNotFoundError, ImportError):
import keras
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
logger = logging.getLogger(__name__)
AUTO = tf.data.AUTOTUNE
......@@ -209,7 +223,7 @@ def main(args):
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
if args.bfloat16:
tf.keras.mixed_precision.set_global_policy("mixed_bfloat16")
keras.mixed_precision.set_global_policy("mixed_bfloat16")
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
config = AutoConfig.from_pretrained(args.pretrained_model_config)
......
......@@ -30,6 +30,7 @@ from typing import Optional
import evaluate
import tensorflow as tf
from datasets import load_dataset
from packaging.version import parse
from utils_qa import postprocess_qa_predictions
import transformers
......@@ -48,6 +49,19 @@ from transformers import (
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
try:
import tf_keras as keras
except (ModuleNotFoundError, ImportError):
import keras
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.38.0.dev0")
......@@ -233,7 +247,7 @@ class DataTrainingArguments:
# region Helper classes
class SavePretrainedCallback(tf.keras.callbacks.Callback):
class SavePretrainedCallback(keras.callbacks.Callback):
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
# that saves the model with this method after each epoch.
......
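
For reference, the `SavePretrainedCallback` used by these example scripts is essentially an `on_epoch_end` hook around `save_pretrained()`; a minimal sketch, with the `output_dir` constructor argument mirroring the example scripts rather than any library API:

```py
from transformers.modeling_tf_utils import keras


class SavePretrainedCallback(keras.callbacks.Callback):
    # Saves the weights plus the config metadata needed to reload the
    # checkpoint with from_pretrained(), at the end of every epoch.
    def __init__(self, output_dir, **kwargs):
        super().__init__(**kwargs)
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        self.model.save_pretrained(self.output_dir)
```
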
......@@ -23,6 +23,20 @@ from unittest import skip
from unittest.mock import patch
import tensorflow as tf
from packaging.version import parse
try:
import tf_keras as keras
except (ModuleNotFoundError, ImportError):
import keras
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
from transformers.testing_utils import TestCasePlus, get_gpu_count, slow
......@@ -115,7 +129,7 @@ class ExamplesTests(TestCasePlus):
with patch.object(sys, "argv", testargs):
run_text_classification.main()
# Reset the mixed precision policy so we don't break other tests
tf.keras.mixed_precision.set_global_policy("float32")
keras.mixed_precision.set_global_policy("float32")
result = get_results(tmp_dir)
self.assertGreaterEqual(result["eval_accuracy"], 0.75)
......
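
The float32 reset in this test matters because the Keras mixed-precision policy is global, process-wide state; the set/restore pattern in isolation (a sketch, not the test's exact code):

```py
from transformers.modeling_tf_utils import keras

keras.mixed_precision.set_global_policy("mixed_bfloat16")
try:
    ...  # run the bf16 training under test
finally:
    # Restore the default so later tests in the same process are not silently bf16
    keras.mixed_precision.set_global_policy("float32")
```
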
......@@ -27,6 +27,7 @@ from typing import Optional
import numpy as np
from datasets import load_dataset
from packaging.version import parse
from transformers import (
AutoConfig,
......@@ -46,11 +47,24 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" # Reduce the amount of console output
import tensorflow as tf # noqa: E402
try:
import tf_keras as keras
except (ModuleNotFoundError, ImportError):
import keras
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
logger = logging.getLogger(__name__)
# region Helper classes
class SavePretrainedCallback(tf.keras.callbacks.Callback):
class SavePretrainedCallback(keras.callbacks.Callback):
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
# that saves the model with this method after each epoch.
......
......@@ -15,7 +15,20 @@
import math
import tensorflow as tf
from packaging import version
from packaging.version import parse
try:
import tf_keras as keras
except (ModuleNotFoundError, ImportError):
import keras
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
def _gelu(x):
......@@ -99,12 +112,12 @@ def glu(x, axis=-1):
return a * tf.math.sigmoid(b)
if version.parse(tf.version.VERSION) >= version.parse("2.4"):
if parse(tf.version.VERSION) >= parse("2.4"):
def approximate_gelu_wrap(x):
return tf.keras.activations.gelu(x, approximate=True)
return keras.activations.gelu(x, approximate=True)
gelu = tf.keras.activations.gelu
gelu = keras.activations.gelu
gelu_new = approximate_gelu_wrap
else:
gelu = _gelu
......@@ -119,11 +132,11 @@ ACT2FN = {
"glu": glu,
"mish": mish,
"quick_gelu": quick_gelu,
"relu": tf.keras.activations.relu,
"sigmoid": tf.keras.activations.sigmoid,
"silu": tf.keras.activations.swish,
"swish": tf.keras.activations.swish,
"tanh": tf.keras.activations.tanh,
"relu": keras.activations.relu,
"sigmoid": keras.activations.sigmoid,
"silu": keras.activations.swish,
"swish": keras.activations.swish,
"tanh": keras.activations.tanh,
}
......
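
Modeling code resolves `config.hidden_act` / `config.activation_function` strings against this `ACT2FN` table, typically through the `get_tf_activation` helper defined in the same module; a small usage sketch:

```py
import tensorflow as tf

from transformers.activations_tf import get_tf_activation

gelu_fn = get_tf_activation("gelu")  # looks the string up in ACT2FN
print(gelu_fn(tf.constant([-1.0, 0.0, 1.0])))
```
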
......@@ -8,16 +8,16 @@ import numpy as np
import tensorflow as tf
from huggingface_hub import Repository, create_repo
from packaging.version import parse
from tensorflow.keras.callbacks import Callback
from . import IntervalStrategy, PreTrainedTokenizerBase
from .modelcard import TrainingSummary
from .modeling_tf_utils import keras
logger = logging.getLogger(__name__)
class KerasMetricCallback(Callback):
class KerasMetricCallback(keras.callbacks.Callback):
"""
Callback to compute metrics at the end of every epoch. Unlike normal Keras metrics, these do not need to be
compilable by TF. It is particularly useful for common NLP metrics like BLEU and ROUGE that require string
......@@ -265,7 +265,7 @@ class KerasMetricCallback(Callback):
logs.update(metric_output)
class PushToHubCallback(Callback):
class PushToHubCallback(keras.callbacks.Callback):
"""
Callback that will save and push the model to the Hub regularly. By default, it pushes once per epoch, but this can
be changed with the `save_strategy` argument. Pushed models can be accessed like any other model on the hub, such
......
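
Both callbacks are meant to be passed straight to `model.fit`; a hedged usage sketch in which `tf_train_dataset`, `tf_eval_dataset` and `tokenizer` are placeholders, not values from this diff:

```py
import numpy as np

from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback


def compute_metrics(eval_predictions):
    # Receives (predictions, labels) accumulated over the whole eval set
    logits, labels = eval_predictions
    return {"accuracy": float((np.argmax(logits, axis=-1) == labels).mean())}


callbacks = [
    KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset),
    PushToHubCallback(output_dir="model_output", tokenizer=tokenizer),  # pushes once per epoch by default
]
# model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=callbacks, epochs=3)
```
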
......@@ -704,7 +704,7 @@ class TrainingSummary:
def parse_keras_history(logs):
"""
Parse the `logs` of either a `tf.keras.History` object returned by `model.fit()` or an accumulated logs `dict`
Parse the `logs` of either a `keras.History` object returned by `model.fit()` or an accumulated logs `dict`
passed to the `PushToHubCallback`. Returns lines and logs compatible with those returned by `parse_log_history`.
"""
if hasattr(logs, "history"):
......@@ -800,14 +800,14 @@ def parse_log_history(log_history):
def extract_hyperparameters_from_keras(model):
import tensorflow as tf
from .modeling_tf_utils import keras
hyperparameters = {}
if hasattr(model, "optimizer") and model.optimizer is not None:
hyperparameters["optimizer"] = model.optimizer.get_config()
else:
hyperparameters["optimizer"] = None
hyperparameters["training_precision"] = tf.keras.mixed_precision.global_policy().name
hyperparameters["training_precision"] = keras.mixed_precision.global_policy().name
return hyperparameters
......
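
The `keras.History` object mentioned in the docstring is what `model.fit()` returns; its `.history` attribute is a dict of per-epoch metric lists, which is the structure `parse_keras_history` flattens. A tiny runnable illustration with a throwaway model:

```py
import tensorflow as tf

from transformers.modeling_tf_utils import keras

model = keras.Sequential([keras.layers.Dense(1)])
model.compile(optimizer="sgd", loss="mse")
history = model.fit(tf.zeros((8, 4)), tf.zeros((8, 1)), epochs=2, verbose=0)
print(history.history)  # e.g. {"loss": [..., ...]} -- one entry per epoch and metric
```
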
......@@ -260,7 +260,6 @@ def load_pytorch_state_dict_in_tf2_model(
"""Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading
safetensors archive created with the safe_open() function."""
import tensorflow as tf
from keras import backend as K
if tf_inputs is None:
tf_inputs = tf_model.dummy_inputs
......@@ -360,7 +359,7 @@ def load_pytorch_state_dict_in_tf2_model(
tf_loaded_numel += tensor_size(array)
K.set_value(symbolic_weight, array)
symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
del array # Immediately free memory to keep peak usage as low as possible
all_pytorch_weights.discard(name)
......
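
The substitution of `K.set_value` with `assign` plus an explicit cast is the "cast for tf.assign" item from the commit list; in isolation the pattern looks like this (toy shapes, not library code):

```py
import numpy as np
import tensorflow as tf

# Casting to the variable's own dtype guards against dtype mismatches between
# the checkpoint array and the TF variable (the "just in case" from the commit list).
symbolic_weight = tf.Variable(tf.zeros((2, 2), dtype=tf.float32))
array = np.ones((2, 2), dtype=np.float64)
symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
```
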
......@@ -33,7 +33,6 @@ import h5py
import numpy as np
import tensorflow as tf
from huggingface_hub import Repository, list_repo_files
from keras import backend as K
from packaging.version import parse
from . import DataCollatorWithPadding, DefaultDataCollator
......@@ -79,6 +78,20 @@ if is_safetensors_available():
if TYPE_CHECKING:
from . import PreTrainedTokenizerBase
try:
import tf_keras as keras
from tf_keras import backend as K
except (ModuleNotFoundError, ImportError):
import keras
from keras import backend as K
if parse(keras.__version__).major > 2:
raise ValueError(
"Your currently installed version of Keras is Keras 3, but this is not yet supported in "
"Transformers. Please install the backwards-compatible tf-keras package with "
"`pip install tf-keras`."
)
logger = logging.get_logger(__name__)
tf_logger = tf.get_logger()
......@@ -103,7 +116,7 @@ def dummy_loss(y_true, y_pred):
class TFModelUtilsMixin:
"""
A few utilities for `tf.keras.Model`, to be used as a mixin.
A few utilities for `keras.Model`, to be used as a mixin.
"""
def num_parameters(self, only_trainable: bool = False) -> int:
......@@ -134,10 +147,10 @@ def keras_serializable(cls):
2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and
convert it to a config object for the actual layer initializer.
3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not
need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`.
need to be supplied in `custom_objects` in the call to `keras.models.load_model`.
Args:
cls (a `tf.keras.layers.Layers subclass`):
cls (a `keras.layers.Layers subclass`):
Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to its
initializer.
......@@ -171,7 +184,7 @@ def keras_serializable(cls):
cls.__init__ = wrapped_init
if not hasattr(cls, "get_config"):
raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses")
raise TypeError("Only use @keras_serializable on keras.layers.Layer subclasses")
if hasattr(cls.get_config, "_is_default"):
def get_config(self):
......@@ -183,8 +196,8 @@ def keras_serializable(cls):
cls.get_config = get_config
cls._keras_serializable = True
if hasattr(tf.keras.utils, "register_keras_serializable"):
cls = tf.keras.utils.register_keras_serializable()(cls)
if hasattr(keras.utils, "register_keras_serializable"):
cls = keras.utils.register_keras_serializable()(cls)
return cls
......@@ -200,9 +213,7 @@ class TFCausalLanguageModelingLoss:
"""
def hf_compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if self.config.tf_legacy_loss:
# make sure only labels that are not equal to -100 affect the loss
active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100)
......@@ -225,9 +236,7 @@ class TFQuestionAnsweringLoss:
"""
def hf_compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
start_loss = loss_fn(labels["start_position"], logits[0])
end_loss = loss_fn(labels["end_position"], logits[1])
......@@ -246,9 +255,7 @@ class TFTokenClassificationLoss:
"""
def hf_compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if tf.executing_eagerly(): # Data-dependent conditionals are forbidden in XLA
if tf.math.reduce_any(labels == -1):
tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
......@@ -285,13 +292,13 @@ class TFSequenceClassificationLoss:
def hf_compute_loss(self, labels, logits):
if logits.shape.rank == 1 or logits.shape[1] == 1:
loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
loss_fn = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.NONE)
if labels.shape.rank == 1:
# MeanSquaredError returns a scalar loss if the labels are 1D, so avoid that
labels = tf.expand_dims(labels, axis=-1)
else:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
loss_fn = keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=keras.losses.Reduction.NONE
)
return loss_fn(labels, logits)
......@@ -301,9 +308,7 @@ class TFMultipleChoiceLoss:
"""Loss function suitable for multiple choice tasks."""
def hf_compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
return loss_fn(labels, logits)
......@@ -331,9 +336,7 @@ class TFNextSentencePredictionLoss:
"""
def hf_compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if self.config.tf_legacy_loss:
# make sure only labels that are not equal to -100
# are taken into account as loss
......@@ -435,7 +438,7 @@ def unpack_inputs(func):
def input_processing(func, config, **kwargs):
"""
Process the input of each TensorFlow model including the booleans. In case of a list of symbolic inputs, each input
has to be named accordingly to the parameters name, i.e. `input_ids = tf.keras.Input(shape=(128,), dtype='int32',
has to be named accordingly to the parameters name, i.e. `input_ids = keras.Input(shape=(128,), dtype='int32',
name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training.
Args:
......@@ -710,7 +713,7 @@ def load_tf_sharded_weights(model, shard_files, ignore_mismatched_sizes=False, s
loaded in the model.
Args:
model (`tf.keras.models.Model`): The model in which to load the checkpoint.
model (`keras.models.Model`): The model in which to load the checkpoint.
shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names.
ignore_mismatched_sizes (`bool`, *optional*, defaults to `True`):
Whether or not to ignore the mismatch between the sizes
......@@ -773,13 +776,13 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
Loads a shard from a sharded checkpoint file. Handles the missing keys and unexpected keys.
Args:
model (`tf.keras.models.Model`): Model in which the weights are loaded
model (`keras.models.Model`): Model in which the weights are loaded
model_layer_map (`Dict`): A dictionary mapping the layer name to the index of the layer in the model.
resolved_archive_file (`str`): Path to the checkpoint file from which the weights will be loaded
ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys
Returns:
`tf.keras.models.Model`: Three lists, one for the layers that were found and succesfully restored (from the
`keras.models.Model`: Three lists, one for the layers that were found and successfully restored (from the
shard file), one for the mismatched layers, and another one for the unexpected layers.
"""
saved_weight_names_set = set()
......@@ -862,7 +865,7 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
shapes.
Args:
model (`tf.keras.models.Model`):
model (`keras.models.Model`):
The model to load the weights into.
resolved_archive_file (`str`):
The location of the H5 file.
......@@ -1055,7 +1058,7 @@ def init_copy_embeddings(old_embeddings, new_num_tokens):
return mask, current_weights
class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin):
class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin):
r"""
Base class for all TF models.
......@@ -1295,7 +1298,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
return False
return True
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
"""
Returns the model's input embeddings layer.
......@@ -1505,7 +1508,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
self._using_dummy_loss = True
else:
self._using_dummy_loss = False
parent_args = list(inspect.signature(tf.keras.Model.compile).parameters.keys())
parent_args = list(inspect.signature(keras.Model.compile).parameters.keys())
# This argument got renamed, we need to support both versions
if "steps_per_execution" in parent_args:
super().compile(
......@@ -1531,7 +1534,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
)
def compute_loss(self, *args, **kwargs):
if hasattr(tf.keras.Model, "compute_loss"):
if hasattr(keras.Model, "compute_loss"):
# This will be true in TF 2.8 or greater
return super().compute_loss(*args, **kwargs)
else:
......@@ -1575,7 +1578,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"):
# Newer TF train steps leave this out
data = expand_1d(data)
x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
# If the inputs are mutable dictionaries, make a shallow copy of them because we will modify
# them during input/label pre-processing. This avoids surprising the user by wrecking their data.
# In addition, modifying mutable Python inputs makes XLA compilation impossible.
......@@ -1682,7 +1685,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"):
# Newer versions leave this out
data = expand_1d(data)
x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
# If the inputs are mutable dictionaries, make a shallow copy of them because we will modify
# them during input/label pre-processing. This avoids surprising the user by wrecking their data.
# In addition, modifying mutable Python inputs makes XLA compilation impossible.
......@@ -1851,7 +1854,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
self.build_in_name_scope()
main_layer.set_input_embeddings(value)
def get_output_embeddings(self) -> Union[None, tf.keras.layers.Layer]:
def get_output_embeddings(self) -> Union[None, keras.layers.Layer]:
"""
Returns the model's output embeddings
......@@ -1888,13 +1891,13 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
self.build_in_name_scope()
lm_head.set_output_embeddings(value)
def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
def get_output_layer_with_bias(self) -> Union[None, keras.layers.Layer]:
"""
Get the layer that handles a bias attribute in case the model has an LM head with weights tied to the
embeddings
Return:
`tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
`keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
"""
warnings.warn(
"The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
......@@ -1944,18 +1947,18 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
self.build_in_name_scope()
lm_head.set_bias(value)
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
"""
The LM Head layer. This method must be overwritten by all the models that have an LM head.
Return:
`tf.keras.layers.Layer`: The LM head layer if the model has one, None if not.
`keras.layers.Layer`: The LM head layer if the model has one, None if not.
"""
return None
def resize_token_embeddings(
self, new_num_tokens: Optional[int] = None
) -> Union[tf.keras.layers.Embedding, tf.Variable]:
) -> Union[keras.layers.Embedding, tf.Variable]:
"""
Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
......@@ -1968,12 +1971,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
returns a pointer to the input tokens without doing anything.
Return:
`tf.Variable` or `tf.keras.layers.Embedding`: Pointer to the input tokens of the model.
`tf.Variable` or `keras.layers.Embedding`: Pointer to the input tokens of the model.
"""
# TODO (joao): flagged for replacement (by `_v2_resized_token_embeddings`) due to embeddings refactor
# Run the new code path if the model has a keras embeddings layer
if isinstance(self.get_input_embeddings(), tf.keras.layers.Embedding):
if isinstance(self.get_input_embeddings(), keras.layers.Embedding):
return self._v2_resized_token_embeddings(new_num_tokens)
if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
......@@ -1986,7 +1989,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
return model_embeds
def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> tf.keras.layers.Embedding:
def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> keras.layers.Embedding:
"""
Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
......@@ -1997,7 +2000,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
returns a pointer to the input tokens without doing anything.
Return:
`tf.keras.layers.Embedding`: Pointer to the input tokens of the model.
`keras.layers.Embedding`: Pointer to the input tokens of the model.
"""
if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
return self.get_input_embeddings()
......@@ -2245,20 +2248,20 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
return new_embeddings
def _v2_get_resized_embeddings(
self, old_embeddings: tf.keras.layers.Embedding, new_num_tokens: int
) -> tf.keras.layers.Embedding:
self, old_embeddings: keras.layers.Embedding, new_num_tokens: int
) -> keras.layers.Embedding:
"""
Build a resized Embedding layer from a provided Embedding layer. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end.
Args:
old_embeddings (`tf.keras.layers.Embedding`):
old_embeddings (`keras.layers.Embedding`):
Old embeddings to be resized.
new_num_tokens (`int`, *optional*):
New number of tokens in the embedding matrix.
Return:
`tf.keras.layers.Embedding`: Resized Embedding layer.
`keras.layers.Embedding`: Resized Embedding layer.
"""
# Get the initialization range for the embeddings
......@@ -2273,10 +2276,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
init_range = getattr(self.config, var_name)
# Get a new (initialized) embeddings layer
new_embeddings = tf.keras.layers.Embedding(
new_embeddings = keras.layers.Embedding(
input_dim=new_num_tokens,
output_dim=old_embeddings.output_dim,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=init_range),
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=init_range),
name=old_embeddings.embeddings.name[:-13], # exact same scoped name except "/embeddings:0"
)
new_embeddings(tf.constant([[0]]))
......@@ -3184,7 +3187,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
cls._auto_class = auto_class
class TFConv1D(tf.keras.layers.Layer):
class TFConv1D(keras.layers.Layer):
"""
1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
......@@ -3198,7 +3201,7 @@ class TFConv1D(tf.keras.layers.Layer):
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation to use to initialize the weights.
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""
def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
......@@ -3227,7 +3230,7 @@ class TFConv1D(tf.keras.layers.Layer):
return x
class TFSharedEmbeddings(tf.keras.layers.Layer):
class TFSharedEmbeddings(keras.layers.Layer):
r"""
Construct shared token embeddings.
......@@ -3243,7 +3246,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
The standard deviation to use when initializing the weights. If no value is provided, it will default to
\\(1/\sqrt{hidden\_size}\\).
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""
# TODO (joao): flagged for deletion due to embeddings refactor
......@@ -3254,7 +3257,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
self.hidden_size = hidden_size
self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
warnings.warn(
"`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `tf.keras.layers.Embedding` instead.",
"`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `keras.layers.Embedding` instead.",
DeprecationWarning,
)
......@@ -3331,7 +3334,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
return tf.reshape(logits, first_dims + [self.vocab_size])
class TFSequenceSummary(tf.keras.layers.Layer):
class TFSequenceSummary(keras.layers.Layer):
"""
Compute a single vector summary of a sequence hidden states.
......@@ -3358,7 +3361,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""
def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs):
......@@ -3377,7 +3380,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
num_classes = config.num_labels
else:
num_classes = config.hidden_size
self.summary = tf.keras.layers.Dense(
self.summary = keras.layers.Dense(
num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
)
......@@ -3389,11 +3392,11 @@ class TFSequenceSummary(tf.keras.layers.Layer):
self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0
if self.has_first_dropout:
self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
self.first_dropout = keras.layers.Dropout(config.summary_first_dropout)
self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
if self.has_last_dropout:
self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
self.last_dropout = keras.layers.Dropout(config.summary_last_dropout)
self.hidden_size = config.hidden_size
def call(self, inputs, cls_index=None, training=False):
......@@ -3456,14 +3459,14 @@ class TFSequenceSummary(tf.keras.layers.Layer):
self.summary.build(self.hidden_size)
def get_initializer(initializer_range: float = 0.02) -> tf.keras.initializers.TruncatedNormal:
def get_initializer(initializer_range: float = 0.02) -> keras.initializers.TruncatedNormal:
"""
Creates a `tf.keras.initializers.TruncatedNormal` with the given range.
Creates a `keras.initializers.TruncatedNormal` with the given range.
Args:
initializer_range (*float*, defaults to 0.02): Standard deviation of the initializer range.
Returns:
`tf.keras.initializers.TruncatedNormal`: The truncated normal initializer.
`keras.initializers.TruncatedNormal`: The truncated normal initializer.
"""
return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
return keras.initializers.TruncatedNormal(stddev=initializer_range)
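
All of the `hf_compute_loss` implementations above follow the same shape: a per-token `SparseCategoricalCrossentropy` with `Reduction.NONE`, followed by masking out the positions labelled `-100`. A standalone sketch of that masking, not the exact library code:

```py
import tensorflow as tf

from transformers.modeling_tf_utils import keras

loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)

labels = tf.constant([[5, -100, 2]])   # -100 marks positions that must not affect the loss
logits = tf.random.normal((1, 3, 10))  # (batch, seq_len, vocab)

unmasked_loss = loss_fn(tf.nn.relu(labels), logits)  # relu clips -100 to 0 so the loss call stays valid
mask = tf.cast(labels != -100, unmasked_loss.dtype)
loss = tf.reduce_sum(unmasked_loss * mask) / tf.reduce_sum(mask)
```
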
......@@ -44,6 +44,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
......@@ -84,9 +85,7 @@ class TFAlbertPreTrainingLoss:
"""
def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
if self.config.tf_legacy_loss:
# make sure only labels that are not equal to -100
# are taken into account as loss
......@@ -133,7 +132,7 @@ class TFAlbertPreTrainingLoss:
return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
class TFAlbertEmbeddings(tf.keras.layers.Layer):
class TFAlbertEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: AlbertConfig, **kwargs):
......@@ -143,8 +142,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
self.embedding_size = config.embedding_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
......@@ -217,7 +216,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
return final_embeddings
class TFAlbertAttention(tf.keras.layers.Layer):
class TFAlbertAttention(keras.layers.Layer):
"""Contains the complete attention sublayer, including both dropouts and layer norm."""
def __init__(self, config: AlbertConfig, **kwargs):
......@@ -235,22 +234,22 @@ class TFAlbertAttention(tf.keras.layers.Layer):
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.output_attentions = config.output_attentions
self.query = tf.keras.layers.Dense(
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
......@@ -334,12 +333,12 @@ class TFAlbertAttention(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFAlbertLayer(tf.keras.layers.Layer):
class TFAlbertLayer(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
self.attention = TFAlbertAttention(config, name="attention")
self.ffn = tf.keras.layers.Dense(
self.ffn = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
)
......@@ -348,13 +347,13 @@ class TFAlbertLayer(tf.keras.layers.Layer):
else:
self.activation = config.hidden_act
self.ffn_output = tf.keras.layers.Dense(
self.ffn_output = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
)
self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
self.full_layer_layer_norm = keras.layers.LayerNormalization(
epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
)
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(
......@@ -401,7 +400,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
class TFAlbertLayerGroup(tf.keras.layers.Layer):
class TFAlbertLayerGroup(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
......@@ -453,7 +452,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
layer.build(None)
class TFAlbertTransformer(tf.keras.layers.Layer):
class TFAlbertTransformer(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
......@@ -461,7 +460,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
self.num_hidden_groups = config.num_hidden_groups
# Number of layers in a hidden group
self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups)
self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
self.embedding_hidden_mapping_in = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="embedding_hidden_mapping_in",
......@@ -534,13 +533,13 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "albert"
class TFAlbertMLMHead(tf.keras.layers.Layer):
def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
class TFAlbertMLMHead(keras.layers.Layer):
def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
self.embedding_size = config.embedding_size
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
......@@ -548,7 +547,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
else:
self.activation = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
......@@ -570,7 +569,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
with tf.name_scope(self.LayerNorm.name):
self.LayerNorm.build([None, None, self.config.embedding_size])
def get_output_embeddings(self) -> tf.keras.layers.Layer:
def get_output_embeddings(self) -> keras.layers.Layer:
return self.decoder
def set_output_embeddings(self, value: tf.Variable):
......@@ -599,7 +598,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
@keras_serializable
class TFAlbertMainLayer(tf.keras.layers.Layer):
class TFAlbertMainLayer(keras.layers.Layer):
config_class = AlbertConfig
def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs):
......@@ -610,7 +609,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
self.encoder = TFAlbertTransformer(config, name="encoder")
self.pooler = (
tf.keras.layers.Dense(
keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
......@@ -620,7 +619,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
else None
)
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
......@@ -776,7 +775,7 @@ ALBERT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
......@@ -942,7 +941,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
return self.predictions
@unpack_inputs
......@@ -1032,12 +1031,12 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
self.sop_classifier.build(None)
class TFAlbertSOPHead(tf.keras.layers.Layer):
class TFAlbertSOPHead(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs)
self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
......@@ -1070,7 +1069,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
return self.predictions
@unpack_inputs
......@@ -1184,8 +1183,8 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, name="albert")
self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
......@@ -1283,8 +1282,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
if config.classifier_dropout_prob is not None
else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob)
self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
......@@ -1372,7 +1371,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
self.qa_outputs = tf.keras.layers.Dense(
self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
self.config = config
......@@ -1478,8 +1477,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
super().__init__(config, *inputs, **kwargs)
self.albert = TFAlbertMainLayer(config, name="albert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
......
......@@ -38,6 +38,7 @@ from ...modeling_tf_utils import (
TFModelInputType,
TFPreTrainedModel,
TFSequenceClassificationLoss,
keras,
keras_serializable,
unpack_inputs,
)
......@@ -116,7 +117,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
return (one_cst - expanded_mask) * LARGE_NEGATIVE
class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding):
class TFBartLearnedPositionalEmbedding(keras.layers.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
......@@ -143,7 +144,7 @@ class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding):
return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype))
class TFBartAttention(tf.keras.layers.Layer):
class TFBartAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
......@@ -159,7 +160,7 @@ class TFBartAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout)
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
......@@ -169,10 +170,10 @@ class TFBartAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
......@@ -313,20 +314,20 @@ class TFBartAttention(tf.keras.layers.Layer):
self.out_proj.build([None, None, self.embed_dim])
class TFBartEncoderLayer(tf.keras.layers.Layer):
class TFBartEncoderLayer(keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFBartAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
......@@ -390,7 +391,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartDecoderLayer(tf.keras.layers.Layer):
class TFBartDecoderLayer(keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
......@@ -401,11 +402,11 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
name="self_attn",
is_decoder=True,
)
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBartAttention(
self.embed_dim,
config.decoder_attention_heads,
......@@ -413,10 +414,10 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
......@@ -526,21 +527,21 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartClassificationHead(tf.keras.layers.Layer):
class TFBartClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name: str, **kwargs):
super().__init__(name=name, **kwargs)
self.dense = tf.keras.layers.Dense(inner_dim, name="dense")
self.dropout = tf.keras.layers.Dropout(pooler_dropout)
self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj")
self.dense = keras.layers.Dense(inner_dim, name="dense")
self.dropout = keras.layers.Dropout(pooler_dropout)
self.out_proj = keras.layers.Dense(num_classes, name="out_proj")
self.input_dim = inner_dim
self.inner_dim = inner_dim
def call(self, inputs):
hidden_states = self.dropout(inputs)
hidden_states = self.dense(hidden_states)
hidden_states = tf.keras.activations.tanh(hidden_states)
hidden_states = keras.activations.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
......@@ -583,7 +584,7 @@ BART_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
......@@ -740,7 +741,7 @@ BART_INPUTS_DOCSTRING = r"""
@keras_serializable
class TFBartEncoder(tf.keras.layers.Layer):
class TFBartEncoder(keras.layers.Layer):
config_class = BartConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
......@@ -750,10 +751,10 @@ class TFBartEncoder(tf.keras.layers.Layer):
config: BartConfig
"""
def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs):
def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
......@@ -766,7 +767,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
name="embed_positions",
)
self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model
@unpack_inputs
......@@ -900,7 +901,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
@keras_serializable
class TFBartDecoder(tf.keras.layers.Layer):
class TFBartDecoder(keras.layers.Layer):
config_class = BartConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`]
......@@ -910,7 +911,7 @@ class TFBartDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding
"""
def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs):
def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.padding_idx = config.pad_token_id
......@@ -923,9 +924,9 @@ class TFBartDecoder(tf.keras.layers.Layer):
)
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
@unpack_inputs
def call(
......@@ -1130,16 +1131,16 @@ class TFBartDecoder(tf.keras.layers.Layer):
@keras_serializable
class TFBartMainLayer(tf.keras.layers.Layer):
class TFBartMainLayer(keras.layers.Layer):
config_class = BartConfig
def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.shared = tf.keras.layers.Embedding(
self.shared = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std),
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared",
)
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
......@@ -1358,9 +1359,9 @@ class TFBartModel(TFBartPretrainedModel):
self.model.build(None)
class BiasLayer(tf.keras.layers.Layer):
class BiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis,
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
......
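
The `BiasLayer` docstring above gives the rationale; structurally it is nothing more than a single weight registered inside a layer so that `keras.Model.save_weights` can serialize it. A hedged sketch (the constructor arguments are illustrative, not the exact library signature):

```py
from transformers.modeling_tf_utils import keras


class BiasLayer(keras.layers.Layer):
    # A bare bias wrapped in a layer: save_weights stores weights per layer,
    # so a free-floating tf.Variable would otherwise be skipped.
    def __init__(self, shape, initializer="zeros", trainable=True, **kwargs):
        super().__init__(**kwargs)
        self.bias = self.add_weight(name="bias", shape=shape, initializer=initializer, trainable=trainable)

    def call(self, x):
        return x + self.bias
```
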
......@@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name
if any(x in var_name for x in tensors_to_transpose):
torch_tensor = torch_tensor.T
tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
tf.keras.backend.set_value(tf_var, torch_tensor)
tf_var.assign(tf.cast(torch_tensor, tf_var.dtype))
tf_weight = session.run(tf_var)
print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}")
......
......@@ -49,6 +49,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
......@@ -121,9 +122,7 @@ class TFBertPreTrainingLoss:
"""
def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
# Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0])
......@@ -143,7 +142,7 @@ class TFBertPreTrainingLoss:
return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,))
class TFBertEmbeddings(tf.keras.layers.Layer):
class TFBertEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: BertConfig, **kwargs):
......@@ -153,8 +152,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None):
with tf.name_scope("word_embeddings"):
......@@ -226,7 +225,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
return final_embeddings
class TFBertSelfAttention(tf.keras.layers.Layer):
class TFBertSelfAttention(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
......@@ -241,16 +240,16 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = tf.keras.layers.Dense(
self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder
self.config = config
......@@ -358,15 +357,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.value.build([None, None, self.config.hidden_size])
class TFBertSelfOutput(tf.keras.layers.Layer):
class TFBertSelfOutput(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
......@@ -388,7 +387,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertAttention(tf.keras.layers.Layer):
class TFBertAttention(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
......@@ -439,11 +438,11 @@ class TFBertAttention(tf.keras.layers.Layer):
self.dense_output.build(None)
class TFBertIntermediate(tf.keras.layers.Layer):
class TFBertIntermediate(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
......@@ -468,15 +467,15 @@ class TFBertIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size])
class TFBertOutput(tf.keras.layers.Layer):
class TFBertOutput(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
......@@ -498,7 +497,7 @@ class TFBertOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLayer(tf.keras.layers.Layer):
class TFBertLayer(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
......@@ -601,7 +600,7 @@ class TFBertLayer(tf.keras.layers.Layer):
self.crossattention.build(None)
class TFBertEncoder(tf.keras.layers.Layer):
class TFBertEncoder(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -679,11 +678,11 @@ class TFBertEncoder(tf.keras.layers.Layer):
layer.build(None)
class TFBertPooler(tf.keras.layers.Layer):
class TFBertPooler(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
......@@ -708,11 +707,11 @@ class TFBertPooler(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size])
class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
class TFBertPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
......@@ -723,7 +722,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
......@@ -745,8 +744,8 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
class TFBertLMPredictionHead(keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -768,7 +767,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
with tf.name_scope(self.transform.name):
self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer:
def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable):
......@@ -793,8 +792,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
return hidden_states
class TFBertMLMHead(tf.keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
class TFBertMLMHead(keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs)
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
......@@ -813,11 +812,11 @@ class TFBertMLMHead(tf.keras.layers.Layer):
self.predictions.build(None)
class TFBertNSPHead(tf.keras.layers.Layer):
class TFBertNSPHead(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs)
self.seq_relationship = tf.keras.layers.Dense(
self.seq_relationship = keras.layers.Dense(
units=2,
kernel_initializer=get_initializer(config.initializer_range),
name="seq_relationship",
......@@ -839,7 +838,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
@keras_serializable
class TFBertMainLayer(tf.keras.layers.Layer):
class TFBertMainLayer(keras.layers.Layer):
config_class = BertConfig
def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs):
......@@ -852,7 +851,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
self.encoder = TFBertEncoder(config, name="encoder")
self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings
def set_input_embeddings(self, value: tf.Variable):
......@@ -1086,7 +1085,7 @@ BERT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
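Since these classes really are plain `keras.Model` subclasses, the usual `compile`/`fit` workflow applies. A hedged sketch (checkpoint name, learning rate, and toy data are illustrative only; compiling without a loss lets the model fall back to its internal loss computation):

```py
import tensorflow as tf

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers.modeling_tf_utils import keras

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

batch = dict(tokenizer(["great movie", "terrible movie"], padding=True, return_tensors="tf"))
labels = tf.constant([1, 0])

model.compile(optimizer=keras.optimizers.Adam(learning_rate=3e-5))  # no loss: use the model's internal loss
model.fit(batch, labels, epochs=1)
```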
......@@ -1281,7 +1280,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
self.nsp = TFBertNSPHead(config, name="nsp___cls")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
def get_prefix_bias_name(self) -> str:
......@@ -1407,7 +1406,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
def get_prefix_bias_name(self) -> str:
......@@ -1500,7 +1499,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer:
def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions
def get_prefix_bias_name(self) -> str:
......@@ -1732,8 +1731,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=classifier_dropout)
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
......@@ -1825,8 +1824,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
super().__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
self.config = config
......@@ -1947,8 +1946,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense(
self.dropout = keras.layers.Dropout(rate=classifier_dropout)
self.classifier = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="classifier",
......@@ -2045,7 +2044,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.qa_outputs = tf.keras.layers.Dense(
self.qa_outputs = keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="qa_outputs",
......
......@@ -5,10 +5,11 @@ import tensorflow as tf
from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs
from ...modeling_tf_utils import keras
from .tokenization_bert import BertTokenizer
class TFBertTokenizer(tf.keras.layers.Layer):
class TFBertTokenizer(keras.layers.Layer):
"""
This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
`from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
......
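As a hedged usage sketch of the in-graph tokenizer described above (the checkpoint name is illustrative; `tensorflow_text` must be installed):

```py
import tensorflow as tf

from transformers import TFBertTokenizer

tf_tokenizer = TFBertTokenizer.from_pretrained("bert-base-uncased")

# Runs entirely inside the TF graph, so it can live in a tf.function or a serving signature
outputs = tf_tokenizer(tf.constant(["hello world", "in-graph tokenization"]))
print(outputs["input_ids"])
```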
......@@ -36,6 +36,7 @@ from ...modeling_tf_outputs import (
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFPreTrainedModel,
keras,
keras_serializable,
unpack_inputs,
)
......@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
return (one_cst - expanded_mask) * LARGE_NEGATIVE
class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding):
class TFBlenderbotLearnedPositionalEmbedding(keras.layers.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
......@@ -138,7 +139,7 @@ class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot
class TFBlenderbotAttention(tf.keras.layers.Layer):
class TFBlenderbotAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
......@@ -154,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout)
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
......@@ -164,10 +165,10 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
......@@ -309,20 +310,20 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot
class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
class TFBlenderbotEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFBlenderbotAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
......@@ -387,7 +388,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot
class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
class TFBlenderbotDecoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
......@@ -398,11 +399,11 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
name="self_attn",
is_decoder=True,
)
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBlenderbotAttention(
self.embed_dim,
config.decoder_attention_heads,
......@@ -410,10 +411,10 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
......@@ -533,7 +534,7 @@ BLENDERBOT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -677,7 +678,7 @@ BLENDERBOT_INPUTS_DOCSTRING = r"""
@keras_serializable
class TFBlenderbotEncoder(tf.keras.layers.Layer):
class TFBlenderbotEncoder(keras.layers.Layer):
config_class = BlenderbotConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
......@@ -687,10 +688,10 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
config: BlenderbotConfig
"""
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs):
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
......@@ -703,7 +704,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
name="embed_positions",
)
self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
def get_embed_tokens(self):
return self.embed_tokens
......@@ -849,7 +850,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
@keras_serializable
class TFBlenderbotDecoder(tf.keras.layers.Layer):
class TFBlenderbotDecoder(keras.layers.Layer):
config_class = BlenderbotConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`]
......@@ -859,7 +860,7 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding
"""
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs):
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.padding_idx = config.pad_token_id
......@@ -872,9 +873,9 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
)
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self):
return self.embed_tokens
......@@ -1090,17 +1091,17 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
@keras_serializable
class TFBlenderbotMainLayer(tf.keras.layers.Layer):
class TFBlenderbotMainLayer(keras.layers.Layer):
config_class = BlenderbotConfig
def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.shared = tf.keras.layers.Embedding(
self.shared = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std),
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared",
)
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
......@@ -1325,9 +1326,9 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel):
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
class BiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis,
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
......
......@@ -35,6 +35,7 @@ from ...modeling_tf_outputs import (
from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFPreTrainedModel,
keras,
keras_serializable,
unpack_inputs,
)
......@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
# Copied from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall
class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding):
class TFBlenderbotSmallLearnedPositionalEmbedding(keras.layers.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
......@@ -138,7 +139,7 @@ class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall
class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
class TFBlenderbotSmallAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need"""
def __init__(
......@@ -154,7 +155,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout)
self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
......@@ -164,10 +165,10 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
......@@ -309,20 +310,20 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
class TFBlenderbotSmallEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
self.self_attn = TFBlenderbotSmallAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
......@@ -387,7 +388,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
class TFBlenderbotSmallDecoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.d_model
......@@ -398,11 +399,11 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
name="self_attn",
is_decoder=True,
)
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBlenderbotSmallAttention(
self.embed_dim,
config.decoder_attention_heads,
......@@ -410,10 +411,10 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn",
is_decoder=True,
)
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config
def call(
......@@ -533,7 +534,7 @@ BLENDERBOT_SMALL_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -681,7 +682,7 @@ BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
@keras_serializable
class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
class TFBlenderbotSmallEncoder(keras.layers.Layer):
config_class = BlenderbotSmallConfig
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
......@@ -691,12 +692,10 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
config: BlenderbotSmallConfig
"""
def __init__(
self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs
):
def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
......@@ -709,7 +708,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
name="embed_positions",
)
self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model
def get_embed_tokens(self):
......@@ -855,7 +854,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
@keras_serializable
class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
class TFBlenderbotSmallDecoder(keras.layers.Layer):
config_class = BlenderbotSmallConfig
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`]
......@@ -865,9 +864,7 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding
"""
def __init__(
self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs
):
def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs)
self.config = config
self.padding_idx = config.pad_token_id
......@@ -880,9 +877,9 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
)
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self):
return self.embed_tokens
......@@ -1095,17 +1092,17 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
@keras_serializable
class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer):
class TFBlenderbotSmallMainLayer(keras.layers.Layer):
config_class = BlenderbotSmallConfig
def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
self.shared = tf.keras.layers.Embedding(
self.shared = keras.layers.Embedding(
input_dim=config.vocab_size,
output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std),
embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared",
)
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
......@@ -1314,9 +1311,9 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel):
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer):
class BiasLayer(keras.layers.Layer):
"""
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis,
Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer.
"""
......
......@@ -27,6 +27,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel,
get_initializer,
get_tf_activation,
keras,
keras_serializable,
shape_list,
unpack_inputs,
......@@ -63,7 +64,7 @@ TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
return tf.math.reduce_mean(
tf.keras.metrics.sparse_categorical_crossentropy(
keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
)
)
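The `contrastive_loss` helper above treats the diagonal of a similarity matrix as the positive pairs: row `i` should score column `i` highest, so the sparse targets are simply `0..N-1`. In the CLIP-style models it is typically applied along both axes of the image-text similarity matrix and averaged. A hedged, self-contained sketch:

```py
import tensorflow as tf

from transformers.modeling_tf_utils import keras

similarity = tf.random.normal((8, 8))          # logits between 8 texts (rows) and 8 images (columns)
targets = tf.range(tf.shape(similarity)[0])    # matching pairs sit on the diagonal

text_loss = tf.reduce_mean(
    keras.metrics.sparse_categorical_crossentropy(targets, similarity, from_logits=True)
)
image_loss = tf.reduce_mean(
    keras.metrics.sparse_categorical_crossentropy(targets, tf.transpose(similarity), from_logits=True)
)
loss = (text_loss + image_loss) / 2.0
```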
......@@ -234,7 +235,7 @@ class TFBlipOutput(ModelOutput):
)
class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
class TFBlipVisionEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipVisionConfig, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -242,7 +243,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
self.image_size = config.image_size
self.patch_size = config.patch_size
self.patch_embedding = tf.keras.layers.Conv2D(
self.patch_embedding = keras.layers.Conv2D(
filters=self.embed_dim,
kernel_size=self.patch_size,
strides=self.patch_size,
......@@ -291,7 +292,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip
class TFBlipTextEmbeddings(tf.keras.layers.Layer):
class TFBlipTextEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
......@@ -349,7 +350,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
return final_embeddings
class TFBlipAttention(tf.keras.layers.Layer):
class TFBlipAttention(keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config, **kwargs):
......@@ -364,13 +365,13 @@ class TFBlipAttention(tf.keras.layers.Layer):
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = tf.keras.layers.Dropout(config.attention_dropout, name="dropout")
self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout")
self.qkv = tf.keras.layers.Dense(
self.qkv = keras.layers.Dense(
3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
)
self.projection = tf.keras.layers.Dense(
self.projection = keras.layers.Dense(
self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
)
......@@ -433,7 +434,7 @@ class TFBlipAttention(tf.keras.layers.Layer):
self.projection.build([None, None, self.embed_dim])
class TFBlipMLP(tf.keras.layers.Layer):
class TFBlipMLP(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs)
......@@ -442,10 +443,10 @@ class TFBlipMLP(tf.keras.layers.Layer):
in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5)
fc_std = (2 * config.hidden_size) ** -0.5
self.fc1 = tf.keras.layers.Dense(
self.fc1 = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
)
self.fc2 = tf.keras.layers.Dense(
self.fc2 = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
)
self.config = config
......@@ -468,14 +469,14 @@ class TFBlipMLP(tf.keras.layers.Layer):
self.fc2.build([None, None, self.config.intermediate_size])
class TFBlipEncoderLayer(tf.keras.layers.Layer):
class TFBlipEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs)
self.embed_dim = config.hidden_size
self.self_attn = TFBlipAttention(config, name="self_attn")
self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.mlp = TFBlipMLP(config, name="mlp")
self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
def call(
self,
......@@ -551,7 +552,7 @@ BLIP_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.
......@@ -614,7 +615,7 @@ BLIP_INPUTS_DOCSTRING = r"""
@keras_serializable
class TFBlipEncoder(tf.keras.layers.Layer):
class TFBlipEncoder(keras.layers.Layer):
config_class = BlipConfig
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
......@@ -714,7 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
self.encoder = TFBlipEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
......@@ -798,7 +799,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.post_layernorm.build([None, None, self.embed_dim])
class TFBlipMainLayer(tf.keras.layers.Layer):
class TFBlipMainLayer(keras.layers.Layer):
config_class = BlipConfig
def __init__(self, config: BlipConfig, *args, **kwargs):
......@@ -826,13 +827,13 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
self.text_model = TFBlipTextModel(text_config, name="text_model")
self.vision_model = TFBlipVisionModel(vision_config, name="vision_model")
self.visual_projection = tf.keras.layers.Dense(
self.visual_projection = keras.layers.Dense(
self.projection_dim,
use_bias=False,
kernel_initializer=get_initializer(config.initializer_range),
name="visual_projection",
)
self.text_projection = tf.keras.layers.Dense(
self.text_projection = keras.layers.Dense(
self.projection_dim,
use_bias=False,
kernel_initializer=get_initializer(config.initializer_range),
......@@ -845,7 +846,7 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
self.logit_scale = self.add_weight(
name="logit_scale",
shape=[],
initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True,
)
......@@ -1116,7 +1117,7 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
self.decoder_input_ids = config.text_config.bos_token_id
self.decoder_pad_token_id = config.text_config.pad_token_id
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding
@unpack_inputs
......@@ -1307,7 +1308,7 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
self.decoder_pad_token_id = config.text_config.pad_token_id
self.decoder_start_token_id = config.text_config.bos_token_id
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding
# Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right
......@@ -1557,21 +1558,21 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
# vision projection layer
self.vision_proj = tf.keras.layers.Dense(
self.vision_proj = keras.layers.Dense(
config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="vision_proj",
)
# text projection layer
self.text_proj = tf.keras.layers.Dense(
self.text_proj = keras.layers.Dense(
config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="text_proj",
)
# image text matching head
self.itm_head = tf.keras.layers.Dense(
self.itm_head = keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head"
)
......@@ -1587,7 +1588,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
)
self.config = config
def get_input_embeddings(self) -> tf.keras.layers.Layer:
def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding
@unpack_inputs
......
......@@ -31,6 +31,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel,
get_initializer,
get_tf_activation,
keras,
keras_serializable,
shape_list,
unpack_inputs,
......@@ -75,18 +76,18 @@ BLIP_TEXT_INPUTS_DOCSTRING = r"""
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52
class TFBlipTextEmbeddings(tf.keras.layers.Layer):
class TFBlipTextEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.word_embeddings = tf.keras.layers.Embedding(
self.word_embeddings = keras.layers.Embedding(
config.vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
name="word_embeddings",
)
self.position_embeddings = tf.keras.layers.Embedding(
self.position_embeddings = keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range),
......@@ -95,8 +96,8 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
# self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
......@@ -146,7 +147,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
class TFBlipTextSelfAttention(tf.keras.layers.Layer):
class TFBlipTextSelfAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -160,21 +161,21 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(
self.query = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.key = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.value = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = tf.keras.layers.Embedding(
self.distance_embedding = keras.layers.Embedding(
2 * config.max_position_embeddings - 1, self.attention_head_size
)
self.is_cross_attention = is_cross_attention
......@@ -291,15 +292,15 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.value.build([None, None, self.config.hidden_size])
class TFBlipTextSelfOutput(tf.keras.layers.Layer):
class TFBlipTextSelfOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
......@@ -322,7 +323,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
class TFBlipTextAttention(tf.keras.layers.Layer):
class TFBlipTextAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention=False, **kwargs):
super().__init__(**kwargs)
self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self")
......@@ -367,11 +368,11 @@ class TFBlipTextAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
class TFBlipTextIntermediate(tf.keras.layers.Layer):
class TFBlipTextIntermediate(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
......@@ -396,15 +397,15 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size])
class TFBlipTextOutput(tf.keras.layers.Layer):
class TFBlipTextOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
......@@ -426,7 +427,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLayer(tf.keras.layers.Layer):
class TFBlipTextLayer(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
......@@ -504,7 +505,7 @@ class TFBlipTextLayer(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
@keras_serializable
class TFBlipTextEncoder(tf.keras.layers.Layer):
class TFBlipTextEncoder(keras.layers.Layer):
config_class = BlipTextConfig
def __init__(self, config, name=None, **kwargs):
......@@ -593,11 +594,11 @@ class TFBlipTextEncoder(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
class TFBlipTextPooler(tf.keras.layers.Layer):
class TFBlipTextPooler(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation="tanh",
......@@ -623,11 +624,11 @@ class TFBlipTextPooler(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
class TFBlipTextPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
self.dense = keras.layers.Dense(
units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name="dense",
......@@ -638,7 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
......@@ -660,14 +661,14 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
class TFBlipTextLMPredictionHead(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.transform = TFBlipTextPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = tf.keras.layers.Dense(
self.decoder = keras.layers.Dense(
config.vocab_size,
kernel_initializer=get_initializer(config.initializer_range),
name="decoder",
......@@ -694,7 +695,7 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
return hidden_states
class TFBlipTextOnlyMLMHead(tf.keras.layers.Layer):
class TFBlipTextOnlyMLMHead(keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.predictions = TFBlipTextLMPredictionHead(config, name="predictions")
......@@ -1062,7 +1063,7 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
# Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
# Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
loss_fct = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
lm_loss *= masked_positions
......
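The final hunk above one-hots the labels because Keras's sparse categorical cross-entropy has no `label_smoothing` argument, while the dense `CategoricalCrossentropy` does. A hedged sketch of that de-sparsify-then-smooth pattern with toy shapes:

```py
import tensorflow as tf

from transformers.modeling_tf_utils import keras

vocab_size = 100
labels = tf.constant([[5, -100, 17]])            # -100 marks positions to ignore
logits = tf.random.normal((1, 3, vocab_size))

# Sparse CE has no label_smoothing, so one-hot first; relu clamps -100 to a valid index
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=vocab_size, dtype=tf.float32)
loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")

per_token_loss = loss_fct(one_hot_labels, logits)            # shape (1, 3)
mask = tf.cast(tf.not_equal(labels, -100), tf.float32)
loss = tf.reduce_sum(per_token_loss * mask) / tf.reduce_sum(mask)
```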