Unverified Commit 415e9a09 authored by Matt, committed by GitHub

Add tf_keras imports to prepare for Keras 3 (#28588)

* Port core files + ESM (because ESM code is odd)

* Search-replace in modelling code

* Fix up transfo_xl as well

* Fix other core files + tests (still need to add correct import to tests)

* Fix cookiecutter

* make fixup, fix imports in some more core files

* Auto-add imports to tests

* Cleanup, add imports to sagemaker tests

* Use correct exception for importing tf_keras

* Fixes in modeling_tf_utils

* make fixup

* Correct version parsing code

* Ensure the pipeline tests correctly revert to float32 after each test

* Ensure the pipeline tests correctly revert to float32 after each test

* More tf.keras -> keras

* Add dtype cast

* Better imports of tf_keras

* Add a cast for tf.assign, just in case

* Fix callback imports
parent 1d489b3e
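The recurring change in the diff below is a guarded import that prefers `tf_keras` and only falls back to the standalone `keras` package when it is still Keras 2. Reproduced here as a standalone sketch so the pattern is easy to read outside the diff context:

```python
# Guarded Keras import as added by this commit across examples, tests, and core modules.
from packaging.version import parse

try:
    import tf_keras as keras  # the backwards-compatible Keras 2 fork
except (ModuleNotFoundError, ImportError):
    import keras  # standalone keras; only acceptable if it is still Keras 2

    if parse(keras.__version__).major > 2:
        raise ValueError(
            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
            "Transformers. Please install the backwards-compatible tf-keras package with "
            "`pip install tf-keras`."
        )
```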
@@ -69,7 +69,6 @@ TensorFlow's [model.save](https://www.tensorflow.org/tutorials/keras/save_and_lo
```py
>>> from transformers import TFPreTrainedModel
->>> from tensorflow import keras
>>> model.save_weights("some_folder/tf_model.h5")
>>> model = TFPreTrainedModel.from_pretrained("some_folder")
...
@@ -47,6 +47,7 @@ from transformers import (
    set_seed,
)
from transformers.keras_callbacks import KerasMetricCallback
+from transformers.modeling_tf_utils import keras
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
@@ -363,7 +364,7 @@ def main():
    def _train_transforms(image):
        img_size = image_size
-        image = tf.keras.utils.img_to_array(image)
+        image = keras.utils.img_to_array(image)
        image = random_resized_crop(image, size=img_size)
        image = tf.image.random_flip_left_right(image)
        image /= 255.0
@@ -372,7 +373,7 @@ def main():
        return image
    def _val_transforms(image):
-        image = tf.keras.utils.img_to_array(image)
+        image = keras.utils.img_to_array(image)
        image = tf.image.resize(image, size=image_size)
        # image = np.array(image)  # FIXME - use tf.image function
        image = center_crop(image, size=image_size)
...
@@ -22,6 +22,7 @@ import os
import re
import tensorflow as tf
+from packaging.version import parse
from transformers import (
    AutoConfig,
@@ -33,6 +34,19 @@ from transformers import (
)
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
logger = logging.getLogger(__name__)
AUTO = tf.data.AUTOTUNE
@@ -209,7 +223,7 @@ def main(args):
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    if args.bfloat16:
-        tf.keras.mixed_precision.set_global_policy("mixed_bfloat16")
+        keras.mixed_precision.set_global_policy("mixed_bfloat16")
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    config = AutoConfig.from_pretrained(args.pretrained_model_config)
...
@@ -30,6 +30,7 @@ from typing import Optional
import evaluate
import tensorflow as tf
from datasets import load_dataset
+from packaging.version import parse
from utils_qa import postprocess_qa_predictions
import transformers
@@ -48,6 +49,19 @@ from transformers import (
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.38.0.dev0")
@@ -233,7 +247,7 @@ class DataTrainingArguments:
# region Helper classes
-class SavePretrainedCallback(tf.keras.callbacks.Callback):
+class SavePretrainedCallback(keras.callbacks.Callback):
    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
    # that saves the model with this method after each epoch.
...
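For context, a sketch of what such a `SavePretrainedCallback` typically looks like; only the class line and the comments above are taken from this diff, the `__init__`/`on_epoch_end` body below is illustrative:

```python
from transformers.modeling_tf_utils import keras


class SavePretrainedCallback(keras.callbacks.Callback):
    def __init__(self, output_dir, **kwargs):
        super().__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        # save_pretrained() writes both the weights and the metadata needed to
        # reload the model later with from_pretrained().
        self.model.save_pretrained(self.output_dir)
```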
@@ -23,6 +23,20 @@ from unittest import skip
from unittest.mock import patch
import tensorflow as tf
+from packaging.version import parse
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
from transformers.testing_utils import TestCasePlus, get_gpu_count, slow
@@ -115,7 +129,7 @@ class ExamplesTests(TestCasePlus):
        with patch.object(sys, "argv", testargs):
            run_text_classification.main()
            # Reset the mixed precision policy so we don't break other tests
-            tf.keras.mixed_precision.set_global_policy("float32")
+            keras.mixed_precision.set_global_policy("float32")
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["eval_accuracy"], 0.75)
...
@@ -27,6 +27,7 @@ from typing import Optional
import numpy as np
from datasets import load_dataset
+from packaging.version import parse
from transformers import (
    AutoConfig,
@@ -46,11 +47,24 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # Reduce the amount of console output
import tensorflow as tf  # noqa: E402
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
logger = logging.getLogger(__name__)
# region Helper classes
-class SavePretrainedCallback(tf.keras.callbacks.Callback):
+class SavePretrainedCallback(keras.callbacks.Callback):
    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
    # that saves the model with this method after each epoch.
...
@@ -15,7 +15,20 @@
import math
import tensorflow as tf
-from packaging import version
+from packaging.version import parse
+try:
+    import tf_keras as keras
+except (ModuleNotFoundError, ImportError):
+    import keras
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
def _gelu(x):
@@ -99,12 +112,12 @@ def glu(x, axis=-1):
    return a * tf.math.sigmoid(b)
-if version.parse(tf.version.VERSION) >= version.parse("2.4"):
+if parse(tf.version.VERSION) >= parse("2.4"):
    def approximate_gelu_wrap(x):
-        return tf.keras.activations.gelu(x, approximate=True)
+        return keras.activations.gelu(x, approximate=True)
-    gelu = tf.keras.activations.gelu
+    gelu = keras.activations.gelu
    gelu_new = approximate_gelu_wrap
else:
    gelu = _gelu
@@ -119,11 +132,11 @@ ACT2FN = {
    "glu": glu,
    "mish": mish,
    "quick_gelu": quick_gelu,
-    "relu": tf.keras.activations.relu,
+    "relu": keras.activations.relu,
-    "sigmoid": tf.keras.activations.sigmoid,
+    "sigmoid": keras.activations.sigmoid,
-    "silu": tf.keras.activations.swish,
+    "silu": keras.activations.swish,
-    "swish": tf.keras.activations.swish,
+    "swish": keras.activations.swish,
-    "tanh": tf.keras.activations.tanh,
+    "tanh": keras.activations.tanh,
}
...
@@ -8,16 +8,16 @@ import numpy as np
import tensorflow as tf
from huggingface_hub import Repository, create_repo
from packaging.version import parse
-from tensorflow.keras.callbacks import Callback
from . import IntervalStrategy, PreTrainedTokenizerBase
from .modelcard import TrainingSummary
+from .modeling_tf_utils import keras
logger = logging.getLogger(__name__)
-class KerasMetricCallback(Callback):
+class KerasMetricCallback(keras.callbacks.Callback):
    """
    Callback to compute metrics at the end of every epoch. Unlike normal Keras metrics, these do not need to be
    compilable by TF. It is particularly useful for common NLP metrics like BLEU and ROUGE that require string
@@ -265,7 +265,7 @@ class KerasMetricCallback(Callback):
        logs.update(metric_output)
-class PushToHubCallback(Callback):
+class PushToHubCallback(keras.callbacks.Callback):
    """
    Callback that will save and push the model to the Hub regularly. By default, it pushes once per epoch, but this can
    be changed with the `save_strategy` argument. Pushed models can be accessed like any other model on the hub, such
...
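A minimal check of the effect of the keras_callbacks change above (assuming a TF 2 environment where transformers imports cleanly): both callbacks now subclass the centrally imported `keras.callbacks.Callback`, whether `keras` resolves to `tf_keras` or to a legacy Keras 2 install.

```python
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
from transformers.modeling_tf_utils import keras

# Both callbacks derive from the shared keras handle rather than
# tensorflow.keras.callbacks.Callback.
assert issubclass(KerasMetricCallback, keras.callbacks.Callback)
assert issubclass(PushToHubCallback, keras.callbacks.Callback)
```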
@@ -704,7 +704,7 @@ class TrainingSummary:
def parse_keras_history(logs):
    """
-    Parse the `logs` of either a `tf.keras.History` object returned by `model.fit()` or an accumulated logs `dict`
+    Parse the `logs` of either a `keras.History` object returned by `model.fit()` or an accumulated logs `dict`
    passed to the `PushToHubCallback`. Returns lines and logs compatible with those returned by `parse_log_history`.
    """
    if hasattr(logs, "history"):
@@ -800,14 +800,14 @@ def parse_log_history(log_history):
def extract_hyperparameters_from_keras(model):
-    import tensorflow as tf
+    from .modeling_tf_utils import keras
    hyperparameters = {}
    if hasattr(model, "optimizer") and model.optimizer is not None:
        hyperparameters["optimizer"] = model.optimizer.get_config()
    else:
        hyperparameters["optimizer"] = None
-    hyperparameters["training_precision"] = tf.keras.mixed_precision.global_policy().name
+    hyperparameters["training_precision"] = keras.mixed_precision.global_policy().name
    return hyperparameters
...
@@ -260,7 +260,6 @@ def load_pytorch_state_dict_in_tf2_model(
    """Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading
    safetensors archive created with the safe_open() function."""
    import tensorflow as tf
-    from keras import backend as K
    if tf_inputs is None:
        tf_inputs = tf_model.dummy_inputs
@@ -360,7 +359,7 @@ def load_pytorch_state_dict_in_tf2_model(
        tf_loaded_numel += tensor_size(array)
-        K.set_value(symbolic_weight, array)
+        symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
        del array  # Immediately free memory to keep peak usage as low as possible
        all_pytorch_weights.discard(name)
...
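The `K.set_value` replacement above is the "cast for tf.assign" mentioned in the commit message. A minimal standalone illustration (the variable and array here are placeholders, not taken from this diff):

```python
import numpy as np
import tensorflow as tf

# Direct assignment with an explicit cast, so the incoming NumPy array matches
# the variable's dtype (relevant for weights created under a mixed-precision
# or bfloat16 policy).
symbolic_weight = tf.Variable(tf.zeros((2, 2), dtype=tf.bfloat16))
array = np.ones((2, 2), dtype=np.float32)
symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype))
```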
@@ -33,7 +33,6 @@ import h5py
import numpy as np
import tensorflow as tf
from huggingface_hub import Repository, list_repo_files
-from keras import backend as K
from packaging.version import parse
from . import DataCollatorWithPadding, DefaultDataCollator
@@ -79,6 +78,20 @@ if is_safetensors_available():
if TYPE_CHECKING:
    from . import PreTrainedTokenizerBase
+try:
+    import tf_keras as keras
+    from tf_keras import backend as K
+except (ModuleNotFoundError, ImportError):
+    import keras
+    from keras import backend as K
+    if parse(keras.__version__).major > 2:
+        raise ValueError(
+            "Your currently installed version of Keras is Keras 3, but this is not yet supported in "
+            "Transformers. Please install the backwards-compatible tf-keras package with "
+            "`pip install tf-keras`."
+        )
logger = logging.get_logger(__name__)
tf_logger = tf.get_logger()
@@ -103,7 +116,7 @@ def dummy_loss(y_true, y_pred):
class TFModelUtilsMixin:
    """
-    A few utilities for `tf.keras.Model`, to be used as a mixin.
+    A few utilities for `keras.Model`, to be used as a mixin.
    """
    def num_parameters(self, only_trainable: bool = False) -> int:
@@ -134,10 +147,10 @@ def keras_serializable(cls):
    2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and
       convert it to a config object for the actual layer initializer.
    3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not
-       need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`.
+       need to be supplied in `custom_objects` in the call to `keras.models.load_model`.
    Args:
-        cls (a `tf.keras.layers.Layers subclass`):
+        cls (a `keras.layers.Layers subclass`):
            Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to its
            initializer.
@@ -171,7 +184,7 @@ def keras_serializable(cls):
    cls.__init__ = wrapped_init
    if not hasattr(cls, "get_config"):
-        raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses")
+        raise TypeError("Only use @keras_serializable on keras.layers.Layer subclasses")
    if hasattr(cls.get_config, "_is_default"):
        def get_config(self):
@@ -183,8 +196,8 @@ def keras_serializable(cls):
        cls.get_config = get_config
    cls._keras_serializable = True
-    if hasattr(tf.keras.utils, "register_keras_serializable"):
-        cls = tf.keras.utils.register_keras_serializable()(cls)
+    if hasattr(keras.utils, "register_keras_serializable"):
+        cls = keras.utils.register_keras_serializable()(cls)
    return cls
@@ -200,9 +213,7 @@ class TFCausalLanguageModelingLoss:
    """
    def hf_compute_loss(self, labels, logits):
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        if self.config.tf_legacy_loss:
            # make sure only labels that are not equal to -100 affect the loss
            active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100)
@@ -225,9 +236,7 @@ class TFQuestionAnsweringLoss:
    """
    def hf_compute_loss(self, labels, logits):
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        start_loss = loss_fn(labels["start_position"], logits[0])
        end_loss = loss_fn(labels["end_position"], logits[1])
@@ -246,9 +255,7 @@ class TFTokenClassificationLoss:
    """
    def hf_compute_loss(self, labels, logits):
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        if tf.executing_eagerly():  # Data-dependent conditionals are forbidden in XLA
            if tf.math.reduce_any(labels == -1):
                tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
@@ -285,13 +292,13 @@ class TFSequenceClassificationLoss:
    def hf_compute_loss(self, labels, logits):
        if logits.shape.rank == 1 or logits.shape[1] == 1:
-            loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
+            loss_fn = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.NONE)
            if labels.shape.rank == 1:
                # MeanSquaredError returns a scalar loss if the labels are 1D, so avoid that
                labels = tf.expand_dims(labels, axis=-1)
        else:
-            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-                from_logits=True, reduction=tf.keras.losses.Reduction.NONE
+            loss_fn = keras.losses.SparseCategoricalCrossentropy(
+                from_logits=True, reduction=keras.losses.Reduction.NONE
            )
        return loss_fn(labels, logits)
@@ -301,9 +308,7 @@ class TFMultipleChoiceLoss:
    """Loss function suitable for multiple choice tasks."""
    def hf_compute_loss(self, labels, logits):
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        return loss_fn(labels, logits)
@@ -331,9 +336,7 @@ class TFNextSentencePredictionLoss:
    """
    def hf_compute_loss(self, labels, logits):
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        if self.config.tf_legacy_loss:
            # make sure only labels that are not equal to -100
            # are taken into account as loss
@@ -435,7 +438,7 @@ def unpack_inputs(func):
def input_processing(func, config, **kwargs):
    """
    Process the input of each TensorFlow model including the booleans. In case of a list of symbolic inputs, each input
-    has to be named accordingly to the parameters name, i.e. `input_ids = tf.keras.Input(shape=(128,), dtype='int32',
+    has to be named accordingly to the parameters name, i.e. `input_ids = keras.Input(shape=(128,), dtype='int32',
    name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training.
    Args:
@@ -710,7 +713,7 @@ def load_tf_sharded_weights(model, shard_files, ignore_mismatched_sizes=False, s
    loaded in the model.
    Args:
-        model (`tf.keras.models.Model`): The model in which to load the checkpoint.
+        model (`keras.models.Model`): The model in which to load the checkpoint.
        shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names.
        ignore_mismatched_sizes`bool`, *optional`, defaults to `True`):
            Whether or not to ignore the mismatch between the sizes
@@ -773,13 +776,13 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
    Loads a shard from a sharded checkpoint file. Handles the missing keys and unexpected keys.
    Args:
-        model (`tf.keras.models.Model`): Model in which the weights are loaded
+        model (`keras.models.Model`): Model in which the weights are loaded
        model_layer_map (`Dict`): A dictionary mapping the layer name to the index of the layer in the model.
        resolved_archive_file (`str`): Path to the checkpoint file from which the weights will be loaded
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys
    Returns:
-        `tf.keras.models.Model`: Three lists, one for the layers that were found and succesfully restored (from the
+        `keras.models.Model`: Three lists, one for the layers that were found and succesfully restored (from the
        shard file), one for the mismatched layers, and another one for the unexpected layers.
    """
    saved_weight_names_set = set()
@@ -862,7 +865,7 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
        shapes.
    Args:
-        model (`tf.keras.models.Model`):
+        model (`keras.models.Model`):
            The model to load the weights into.
        resolved_archive_file (`str`):
            The location of the H5 file.
@@ -1055,7 +1058,7 @@ def init_copy_embeddings(old_embeddings, new_num_tokens):
    return mask, current_weights
-class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin):
+class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin):
    r"""
    Base class for all TF models.
@@ -1295,7 +1298,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            return False
        return True
-    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+    def get_input_embeddings(self) -> keras.layers.Layer:
        """
        Returns the model's input embeddings layer.
@@ -1505,7 +1508,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            self._using_dummy_loss = True
        else:
            self._using_dummy_loss = False
-        parent_args = list(inspect.signature(tf.keras.Model.compile).parameters.keys())
+        parent_args = list(inspect.signature(keras.Model.compile).parameters.keys())
        # This argument got renamed, we need to support both versions
        if "steps_per_execution" in parent_args:
            super().compile(
@@ -1531,7 +1534,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            )
    def compute_loss(self, *args, **kwargs):
-        if hasattr(tf.keras.Model, "compute_loss"):
+        if hasattr(keras.Model, "compute_loss"):
            # This will be true in TF 2.8 or greater
            return super().compute_loss(*args, **kwargs)
        else:
@@ -1575,7 +1578,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
        if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"):
            # Newer TF train steps leave this out
            data = expand_1d(data)
-        x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
+        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
        # If the inputs are mutable dictionaries, make a shallow copy of them because we will modify
        # them during input/label pre-processing. This avoids surprising the user by wrecking their data.
        # In addition, modifying mutable Python inputs makes XLA compilation impossible.
@@ -1682,7 +1685,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
        if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"):
            # Newer versions leave this out
            data = expand_1d(data)
-        x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
+        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
        # If the inputs are mutable dictionaries, make a shallow copy of them because we will modify
        # them during input/label pre-processing. This avoids surprising the user by wrecking their data.
        # In addition, modifying mutable Python inputs makes XLA compilation impossible.
@@ -1851,7 +1854,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            self.build_in_name_scope()
        main_layer.set_input_embeddings(value)
-    def get_output_embeddings(self) -> Union[None, tf.keras.layers.Layer]:
+    def get_output_embeddings(self) -> Union[None, keras.layers.Layer]:
        """
        Returns the model's output embeddings
@@ -1888,13 +1891,13 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            self.build_in_name_scope()
        lm_head.set_output_embeddings(value)
-    def get_output_layer_with_bias(self) -> Union[None, tf.keras.layers.Layer]:
+    def get_output_layer_with_bias(self) -> Union[None, keras.layers.Layer]:
        """
        Get the layer that handles a bias attribute in case the model has an LM head with weights tied to the
        embeddings
        Return:
-            `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
+            `keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
        """
        warnings.warn(
            "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
@@ -1944,18 +1947,18 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            self.build_in_name_scope()
        lm_head.set_bias(value)
-    def get_lm_head(self) -> tf.keras.layers.Layer:
+    def get_lm_head(self) -> keras.layers.Layer:
        """
        The LM Head layer. This method must be overwritten by all the models that have a lm head.
        Return:
-            `tf.keras.layers.Layer`: The LM head layer if the model has one, None if not.
+            `keras.layers.Layer`: The LM head layer if the model has one, None if not.
        """
        return None
    def resize_token_embeddings(
        self, new_num_tokens: Optional[int] = None
-    ) -> Union[tf.keras.layers.Embedding, tf.Variable]:
+    ) -> Union[keras.layers.Embedding, tf.Variable]:
        """
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
@@ -1968,12 +1971,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            returns a pointer to the input tokens without doing anything.
        Return:
-            `tf.Variable` or `tf.keras.layers.Embedding`: Pointer to the input tokens of the model.
+            `tf.Variable` or `keras.layers.Embedding`: Pointer to the input tokens of the model.
        """
        # TODO (joao): flagged for replacement (by `_v2_resized_token_embeddings`) due to embeddings refactor
        # Run the new code path if the model has a keras embeddings layer
-        if isinstance(self.get_input_embeddings(), tf.keras.layers.Embedding):
+        if isinstance(self.get_input_embeddings(), keras.layers.Embedding):
            return self._v2_resized_token_embeddings(new_num_tokens)
        if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
@@ -1986,7 +1989,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
        return model_embeds
-    def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> tf.keras.layers.Embedding:
+    def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> keras.layers.Embedding:
        """
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
@@ -1997,7 +2000,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            returns a pointer to the input tokens without doing anything.
        Return:
-            `tf.keras.layers.Embedding`: Pointer to the input tokens of the model.
+            `keras.layers.Embedding`: Pointer to the input tokens of the model.
        """
        if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
            return self.get_input_embeddings()
@@ -2245,20 +2248,20 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
        return new_embeddings
    def _v2_get_resized_embeddings(
-        self, old_embeddings: tf.keras.layers.Embedding, new_num_tokens: int
-    ) -> tf.keras.layers.Embedding:
+        self, old_embeddings: keras.layers.Embedding, new_num_tokens: int
+    ) -> keras.layers.Embedding:
        """
        Build a resized Embedding layer from a provided Embedding layer. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end.
        Args:
-            old_embeddings (`tf.keras.layers.Embedding`):
+            old_embeddings (`keras.layers.Embedding`):
                Old embeddings to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the embedding matrix.
        Return:
-            `tf.keras.layers.Embedding`: Resized Embedding layer.
+            `keras.layers.Embedding`: Resized Embedding layer.
        """
        # Get the initialization range for the embeddings
@@ -2273,10 +2276,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
            init_range = getattr(self.config, var_name)
        # Get a new (initialized) embeddings layer
-        new_embeddings = tf.keras.layers.Embedding(
+        new_embeddings = keras.layers.Embedding(
            input_dim=new_num_tokens,
            output_dim=old_embeddings.output_dim,
-            embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=init_range),
+            embeddings_initializer=keras.initializers.TruncatedNormal(stddev=init_range),
            name=old_embeddings.embeddings.name[:-13],  # exact same scoped name except "/embeddings:0"
        )
        new_embeddings(tf.constant([[0]]))
@@ -3184,7 +3187,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
        cls._auto_class = auto_class
-class TFConv1D(tf.keras.layers.Layer):
+class TFConv1D(keras.layers.Layer):
    """
    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
@@ -3198,7 +3201,7 @@ class TFConv1D(tf.keras.layers.Layer):
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation to use to initialize the weights.
        kwargs (`Dict[str, Any]`, *optional*):
-            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
+            Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
    """
    def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
@@ -3227,7 +3230,7 @@ class TFConv1D(tf.keras.layers.Layer):
        return x
-class TFSharedEmbeddings(tf.keras.layers.Layer):
+class TFSharedEmbeddings(keras.layers.Layer):
    r"""
    Construct shared token embeddings.
@@ -3243,7 +3246,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
            The standard deviation to use when initializing the weights. If no value is provided, it will default to
            \\(1/\sqrt{hidden\_size}\\).
        kwargs (`Dict[str, Any]`, *optional*):
-            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
+            Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
    """
    # TODO (joao): flagged for delection due to embeddings refactor
@@ -3254,7 +3257,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
        self.hidden_size = hidden_size
        self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
        warnings.warn(
-            "`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `tf.keras.layers.Embedding` instead.",
+            "`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `keras.layers.Embedding` instead.",
            DeprecationWarning,
        )
@@ -3331,7 +3334,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
        return tf.reshape(logits, first_dims + [self.vocab_size])
-class TFSequenceSummary(tf.keras.layers.Layer):
+class TFSequenceSummary(keras.layers.Layer):
    """
    Compute a single vector summary of a sequence hidden states.
@@ -3358,7 +3361,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
        initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
        kwargs (`Dict[str, Any]`, *optional*):
-            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
+            Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
    """
    def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs):
@@ -3377,7 +3380,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
                num_classes = config.num_labels
            else:
                num_classes = config.hidden_size
-            self.summary = tf.keras.layers.Dense(
+            self.summary = keras.layers.Dense(
                num_classes, kernel_initializer=get_initializer(initializer_range), name="summary"
            )
@@ -3389,11 +3392,11 @@ class TFSequenceSummary(tf.keras.layers.Layer):
        self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0
        if self.has_first_dropout:
-            self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
+            self.first_dropout = keras.layers.Dropout(config.summary_first_dropout)
        self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0
        if self.has_last_dropout:
-            self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
+            self.last_dropout = keras.layers.Dropout(config.summary_last_dropout)
        self.hidden_size = config.hidden_size
    def call(self, inputs, cls_index=None, training=False):
@@ -3456,14 +3459,14 @@ class TFSequenceSummary(tf.keras.layers.Layer):
            self.summary.build(self.hidden_size)
-def get_initializer(initializer_range: float = 0.02) -> tf.keras.initializers.TruncatedNormal:
+def get_initializer(initializer_range: float = 0.02) -> keras.initializers.TruncatedNormal:
    """
-    Creates a `tf.keras.initializers.TruncatedNormal` with the given range.
+    Creates a `keras.initializers.TruncatedNormal` with the given range.
    Args:
        initializer_range (*float*, defaults to 0.02): Standard deviation of the initializer range.
    Returns:
-        `tf.keras.initializers.TruncatedNormal`: The truncated normal initializer.
+        `keras.initializers.TruncatedNormal`: The truncated normal initializer.
    """
-    return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
+    return keras.initializers.TruncatedNormal(stddev=initializer_range)
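A short sketch of the pattern modelling files adopt after this refactor (see the albert hunk that follows): `keras` is imported from `modeling_tf_utils` alongside helpers like `get_initializer`, so layers are built against whichever Keras 2 the guarded import above selected. The layer below is illustrative, not taken from this diff.

```python
from transformers.modeling_tf_utils import get_initializer, keras

# keras here is either tf_keras or legacy Keras 2, per the guarded import above.
dense = keras.layers.Dense(units=16, kernel_initializer=get_initializer(0.02))
```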
@@ -44,6 +44,7 @@ from ...modeling_tf_utils import (
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
+    keras,
    keras_serializable,
    unpack_inputs,
)
@@ -84,9 +85,7 @@ class TFAlbertPreTrainingLoss:
    """
    def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
-        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
-            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
-        )
+        loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
        if self.config.tf_legacy_loss:
            # make sure only labels that are not equal to -100
            # are taken into account as loss
@@ -133,7 +132,7 @@ class TFAlbertPreTrainingLoss:
        return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,))
-class TFAlbertEmbeddings(tf.keras.layers.Layer):
+class TFAlbertEmbeddings(keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""
    def __init__(self, config: AlbertConfig, **kwargs):
@@ -143,8 +142,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
        self.embedding_size = config.embedding_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
    def build(self, input_shape=None):
        with tf.name_scope("word_embeddings"):
@@ -217,7 +216,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
        return final_embeddings
-class TFAlbertAttention(tf.keras.layers.Layer):
+class TFAlbertAttention(keras.layers.Layer):
    """Contains the complete attention sublayer, including both dropouts and layer norm."""
    def __init__(self, config: AlbertConfig, **kwargs):
@@ -235,22 +234,22 @@ class TFAlbertAttention(tf.keras.layers.Layer):
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
        self.output_attentions = config.output_attentions
-        self.query = tf.keras.layers.Dense(
+        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
-        self.key = tf.keras.layers.Dense(
+        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
-        self.value = tf.keras.layers.Dense(
+        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
-        self.attention_dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
-        self.output_dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+        self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config
    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
@@ -334,12 +333,12 @@ class TFAlbertAttention(tf.keras.layers.Layer):
                self.LayerNorm.build([None, None, self.config.hidden_size])
-class TFAlbertLayer(tf.keras.layers.Layer):
+class TFAlbertLayer(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
        self.attention = TFAlbertAttention(config, name="attention")
-        self.ffn = tf.keras.layers.Dense(
+        self.ffn = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
        )
@@ -348,13 +347,13 @@ class TFAlbertLayer(tf.keras.layers.Layer):
        else:
            self.activation = config.hidden_act
-        self.ffn_output = tf.keras.layers.Dense(
+        self.ffn_output = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output"
        )
-        self.full_layer_layer_norm = tf.keras.layers.LayerNormalization(
+        self.full_layer_layer_norm = keras.layers.LayerNormalization(
            epsilon=config.layer_norm_eps, name="full_layer_layer_norm"
        )
-        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config
    def call(
@@ -401,7 +400,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
                self.full_layer_layer_norm.build([None, None, self.config.hidden_size])
-class TFAlbertLayerGroup(tf.keras.layers.Layer):
+class TFAlbertLayerGroup(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
@@ -453,7 +452,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
                layer.build(None)
-class TFAlbertTransformer(tf.keras.layers.Layer):
+class TFAlbertTransformer(keras.layers.Layer):
    def __init__(self, config: AlbertConfig, **kwargs):
        super().__init__(**kwargs)
@@ -461,7 +460,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
        self.num_hidden_groups = config.num_hidden_groups
        # Number of layers in a hidden group
        self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups)
-        self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
+        self.embedding_hidden_mapping_in = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="embedding_hidden_mapping_in",
@@ -534,13 +533,13 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
    base_model_prefix = "albert"
-class TFAlbertMLMHead(tf.keras.layers.Layer):
-    def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
+class TFAlbertMLMHead(keras.layers.Layer):
+    def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.embedding_size = config.embedding_size
-        self.dense = tf.keras.layers.Dense(
+        self.dense = keras.layers.Dense(
            config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        if isinstance(config.hidden_act, str):
@@ -548,7 +547,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
        else:
            self.activation = config.hidden_act
-        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
@@ -570,7 +569,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.embedding_size])
-    def get_output_embeddings(self) -> tf.keras.layers.Layer:
+    def get_output_embeddings(self) -> keras.layers.Layer:
        return self.decoder
    def set_output_embeddings(self, value: tf.Variable):
@@ -599,7 +598,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
@keras_serializable
-class TFAlbertMainLayer(tf.keras.layers.Layer):
+class TFAlbertMainLayer(keras.layers.Layer):
    config_class = AlbertConfig
    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs):
@@ -610,7 +609,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
        self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
        self.encoder = TFAlbertTransformer(config, name="encoder")
self.pooler = ( self.pooler = (
tf.keras.layers.Dense( keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
...@@ -620,7 +619,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ...@@ -620,7 +619,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
else None else None
) )
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings return self.embeddings
def set_input_embeddings(self, value: tf.Variable): def set_input_embeddings(self, value: tf.Variable):
...@@ -776,7 +775,7 @@ ALBERT_START_DOCSTRING = r""" ...@@ -776,7 +775,7 @@ ALBERT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
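The docstring above presents these classes as drop-in Keras models. As a rough usage sketch (checkpoint name, toy data, and training settings below are illustrative assumptions, not part of this diff), the sequence-classification variant can be compiled and fitted like any other Keras model:

```py
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAlbertForSequenceClassification

# Illustrative checkpoint and toy data, only to show the "regular Keras model" workflow.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
model = TFAlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)

batch = dict(tokenizer(["a fine movie", "a dull movie"], padding=True, return_tensors="np"))
labels = np.array([1, 0])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
model.fit(batch, labels, epochs=1)
```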
...@@ -942,7 +941,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss): ...@@ -942,7 +941,7 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions") self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier") self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.predictions return self.predictions
@unpack_inputs @unpack_inputs
...@@ -1032,12 +1031,12 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss): ...@@ -1032,12 +1031,12 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
self.sop_classifier.build(None) self.sop_classifier.build(None)
class TFAlbertSOPHead(tf.keras.layers.Layer): class TFAlbertSOPHead(keras.layers.Layer):
def __init__(self, config: AlbertConfig, **kwargs): def __init__(self, config: AlbertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -1070,7 +1069,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss) ...@@ -1070,7 +1069,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions") self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.predictions return self.predictions
@unpack_inputs @unpack_inputs
...@@ -1184,8 +1183,8 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass ...@@ -1184,8 +1183,8 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, name="albert") self.albert = TFAlbertMainLayer(config, name="albert")
self.dropout = tf.keras.layers.Dropout(rate=config.classifier_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.config = config self.config = config
...@@ -1283,8 +1282,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat ...@@ -1283,8 +1282,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
if config.classifier_dropout_prob is not None if config.classifier_dropout_prob is not None
else config.hidden_dropout_prob else config.hidden_dropout_prob
) )
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout_prob) self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.config = config self.config = config
...@@ -1372,7 +1371,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL ...@@ -1372,7 +1371,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
self.qa_outputs = tf.keras.layers.Dense( self.qa_outputs = keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
) )
self.config = config self.config = config
...@@ -1478,8 +1477,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1478,8 +1477,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.albert = TFAlbertMainLayer(config, name="albert") self.albert = TFAlbertMainLayer(config, name="albert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.config = config self.config = config
......
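The Bart changes that follow start by extending the `...modeling_tf_utils` import list with `keras`. As a minimal sketch of the pattern applied throughout this diff (the head class below is hypothetical, for illustration only), modeling code now builds layers from that shared alias instead of touching `tf.keras` directly:

```py
# Hypothetical head, for illustration only: modeling files now import `keras`
# from modeling_tf_utils (which may resolve to tf_keras) instead of using tf.keras.
from transformers.modeling_tf_utils import keras


class MyClassificationHead(keras.layers.Layer):
    def __init__(self, hidden_size: int, num_labels: int, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(hidden_size, activation="tanh", name="dense")
        self.out_proj = keras.layers.Dense(num_labels, name="out_proj")

    def call(self, hidden_states):
        return self.out_proj(self.dense(hidden_states))
```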
...@@ -38,6 +38,7 @@ from ...modeling_tf_utils import ( ...@@ -38,6 +38,7 @@ from ...modeling_tf_utils import (
TFModelInputType, TFModelInputType,
TFPreTrainedModel, TFPreTrainedModel,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -116,7 +117,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): ...@@ -116,7 +117,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
return (one_cst - expanded_mask) * LARGE_NEGATIVE return (one_cst - expanded_mask) * LARGE_NEGATIVE
class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding): class TFBartLearnedPositionalEmbedding(keras.layers.Embedding):
""" """
This module learns positional embeddings up to a fixed maximum size. This module learns positional embeddings up to a fixed maximum size.
""" """
...@@ -143,7 +144,7 @@ class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding): ...@@ -143,7 +144,7 @@ class TFBartLearnedPositionalEmbedding(tf.keras.layers.Embedding):
return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype)) return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype))
class TFBartAttention(tf.keras.layers.Layer): class TFBartAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need""" """Multi-headed attention from "Attention Is All You Need"""
def __init__( def __init__(
...@@ -159,7 +160,7 @@ class TFBartAttention(tf.keras.layers.Layer): ...@@ -159,7 +160,7 @@ class TFBartAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim: if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError( raise ValueError(
...@@ -169,10 +170,10 @@ class TFBartAttention(tf.keras.layers.Layer): ...@@ -169,10 +170,10 @@ class TFBartAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
...@@ -313,20 +314,20 @@ class TFBartAttention(tf.keras.layers.Layer): ...@@ -313,20 +314,20 @@ class TFBartAttention(tf.keras.layers.Layer):
self.out_proj.build([None, None, self.embed_dim]) self.out_proj.build([None, None, self.embed_dim])
class TFBartEncoderLayer(tf.keras.layers.Layer): class TFBartEncoderLayer(keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs): def __init__(self, config: BartConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
self.self_attn = TFBartAttention( self.self_attn = TFBartAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
) )
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -390,7 +391,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer): ...@@ -390,7 +391,7 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
self.final_layer_norm.build([None, None, self.embed_dim]) self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartDecoderLayer(tf.keras.layers.Layer): class TFBartDecoderLayer(keras.layers.Layer):
def __init__(self, config: BartConfig, **kwargs): def __init__(self, config: BartConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
...@@ -401,11 +402,11 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): ...@@ -401,11 +402,11 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
name="self_attn", name="self_attn",
is_decoder=True, is_decoder=True,
) )
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBartAttention( self.encoder_attn = TFBartAttention(
self.embed_dim, self.embed_dim,
config.decoder_attention_heads, config.decoder_attention_heads,
...@@ -413,10 +414,10 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): ...@@ -413,10 +414,10 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn", name="encoder_attn",
is_decoder=True, is_decoder=True,
) )
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -526,21 +527,21 @@ class TFBartDecoderLayer(tf.keras.layers.Layer): ...@@ -526,21 +527,21 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
self.final_layer_norm.build([None, None, self.embed_dim]) self.final_layer_norm.build([None, None, self.embed_dim])
class TFBartClassificationHead(tf.keras.layers.Layer): class TFBartClassificationHead(keras.layers.Layer):
"""Head for sentence-level classification tasks.""" """Head for sentence-level classification tasks."""
def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name: str, **kwargs): def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name: str, **kwargs):
super().__init__(name=name, **kwargs) super().__init__(name=name, **kwargs)
self.dense = tf.keras.layers.Dense(inner_dim, name="dense") self.dense = keras.layers.Dense(inner_dim, name="dense")
self.dropout = tf.keras.layers.Dropout(pooler_dropout) self.dropout = keras.layers.Dropout(pooler_dropout)
self.out_proj = tf.keras.layers.Dense(num_classes, name="out_proj") self.out_proj = keras.layers.Dense(num_classes, name="out_proj")
self.input_dim = inner_dim self.input_dim = inner_dim
self.inner_dim = inner_dim self.inner_dim = inner_dim
def call(self, inputs): def call(self, inputs):
hidden_states = self.dropout(inputs) hidden_states = self.dropout(inputs)
hidden_states = self.dense(hidden_states) hidden_states = self.dense(hidden_states)
hidden_states = tf.keras.activations.tanh(hidden_states) hidden_states = keras.activations.tanh(hidden_states)
hidden_states = self.dropout(hidden_states) hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states) hidden_states = self.out_proj(hidden_states)
return hidden_states return hidden_states
...@@ -583,7 +584,7 @@ BART_START_DOCSTRING = r""" ...@@ -583,7 +584,7 @@ BART_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -740,7 +741,7 @@ BART_INPUTS_DOCSTRING = r""" ...@@ -740,7 +741,7 @@ BART_INPUTS_DOCSTRING = r"""
@keras_serializable @keras_serializable
class TFBartEncoder(tf.keras.layers.Layer): class TFBartEncoder(keras.layers.Layer):
config_class = BartConfig config_class = BartConfig
""" """
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
...@@ -750,10 +751,10 @@ class TFBartEncoder(tf.keras.layers.Layer): ...@@ -750,10 +751,10 @@ class TFBartEncoder(tf.keras.layers.Layer):
config: BartConfig config: BartConfig
""" """
def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings self.max_source_positions = config.max_position_embeddings
...@@ -766,7 +767,7 @@ class TFBartEncoder(tf.keras.layers.Layer): ...@@ -766,7 +767,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
name="embed_positions", name="embed_positions",
) )
self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model self.embed_dim = config.d_model
@unpack_inputs @unpack_inputs
...@@ -900,7 +901,7 @@ class TFBartEncoder(tf.keras.layers.Layer): ...@@ -900,7 +901,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBartDecoder(tf.keras.layers.Layer): class TFBartDecoder(keras.layers.Layer):
config_class = BartConfig config_class = BartConfig
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`] Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`]
...@@ -910,7 +911,7 @@ class TFBartDecoder(tf.keras.layers.Layer): ...@@ -910,7 +911,7 @@ class TFBartDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding embed_tokens: output embedding
""" """
def __init__(self, config: BartConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
...@@ -923,9 +924,9 @@ class TFBartDecoder(tf.keras.layers.Layer): ...@@ -923,9 +924,9 @@ class TFBartDecoder(tf.keras.layers.Layer):
) )
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
@unpack_inputs @unpack_inputs
def call( def call(
...@@ -1130,16 +1131,16 @@ class TFBartDecoder(tf.keras.layers.Layer): ...@@ -1130,16 +1131,16 @@ class TFBartDecoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBartMainLayer(tf.keras.layers.Layer): class TFBartMainLayer(keras.layers.Layer):
config_class = BartConfig config_class = BartConfig
def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs): def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.shared = tf.keras.layers.Embedding( self.shared = keras.layers.Embedding(
input_dim=config.vocab_size, input_dim=config.vocab_size,
output_dim=config.d_model, output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared", name="model.shared",
) )
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
...@@ -1358,9 +1359,9 @@ class TFBartModel(TFBartPretrainedModel): ...@@ -1358,9 +1359,9 @@ class TFBartModel(TFBartPretrainedModel):
self.model.build(None) self.model.build(None)
class BiasLayer(tf.keras.layers.Layer): class BiasLayer(keras.layers.Layer):
""" """
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer. so all weights have to be registered in a layer.
""" """
......
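A minimal sketch of the pattern the `BiasLayer` docstring describes, assuming a single trainable bias registered through `add_weight` so that per-layer weight serialization picks it up (class name and shape are illustrative):

```py
from transformers.modeling_tf_utils import keras


class SimpleBias(keras.layers.Layer):
    """Toy bias-as-a-layer: registering the variable on a Layer means
    save_weights / load_weights serialize it like any other weight."""

    def __init__(self, shape, **kwargs):
        super().__init__(**kwargs)
        self.bias = self.add_weight(name="bias", shape=shape, initializer="zeros", trainable=True)

    def call(self, x):
        return x + self.bias
```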
...@@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name ...@@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name
if any(x in var_name for x in tensors_to_transpose): if any(x in var_name for x in tensors_to_transpose):
torch_tensor = torch_tensor.T torch_tensor = torch_tensor.T
tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
tf.keras.backend.set_value(tf_var, torch_tensor) tf_var.assign(tf.cast(torch_tensor, tf_var.dtype))
tf_weight = session.run(tf_var) tf_weight = session.run(tf_var)
print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}") print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}")
......
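The hunk above replaces `keras.backend.set_value` with a direct `assign` preceded by an explicit dtype cast. A small sketch of the defensive cast (shapes and values here are illustrative):

```py
import tensorflow as tf

var = tf.Variable(tf.zeros((2, 2), dtype=tf.float32))
value = tf.ones((2, 2), dtype=tf.float64)

# Assigning an eager tensor of a different dtype would normally raise a
# conversion error, so the conversion script casts to the variable's dtype first.
var.assign(tf.cast(value, var.dtype))
```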
...@@ -49,6 +49,7 @@ from ...modeling_tf_utils import ( ...@@ -49,6 +49,7 @@ from ...modeling_tf_utils import (
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
TFTokenClassificationLoss, TFTokenClassificationLoss,
get_initializer, get_initializer,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -121,9 +122,7 @@ class TFBertPreTrainingLoss: ...@@ -121,9 +122,7 @@ class TFBertPreTrainingLoss:
""" """
def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE)
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
# Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway
unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0]) unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0])
...@@ -143,7 +142,7 @@ class TFBertPreTrainingLoss: ...@@ -143,7 +142,7 @@ class TFBertPreTrainingLoss:
return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,)) return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,))
class TFBertEmbeddings(tf.keras.layers.Layer): class TFBertEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.""" """Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
...@@ -153,8 +152,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer): ...@@ -153,8 +152,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
def build(self, input_shape=None): def build(self, input_shape=None):
with tf.name_scope("word_embeddings"): with tf.name_scope("word_embeddings"):
...@@ -226,7 +225,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): ...@@ -226,7 +225,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
return final_embeddings return final_embeddings
class TFBertSelfAttention(tf.keras.layers.Layer): class TFBertSelfAttention(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -241,16 +240,16 @@ class TFBertSelfAttention(tf.keras.layers.Layer): ...@@ -241,16 +240,16 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_att_head_size = math.sqrt(self.attention_head_size) self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
self.query = tf.keras.layers.Dense( self.query = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
) )
self.key = tf.keras.layers.Dense( self.key = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
) )
self.value = tf.keras.layers.Dense( self.value = keras.layers.Dense(
units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
) )
self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
self.is_decoder = config.is_decoder self.is_decoder = config.is_decoder
self.config = config self.config = config
...@@ -358,15 +357,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer): ...@@ -358,15 +357,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
self.value.build([None, None, self.config.hidden_size]) self.value.build([None, None, self.config.hidden_size])
class TFBertSelfOutput(tf.keras.layers.Layer): class TFBertSelfOutput(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -388,7 +387,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer): ...@@ -388,7 +387,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertAttention(tf.keras.layers.Layer): class TFBertAttention(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -439,11 +438,11 @@ class TFBertAttention(tf.keras.layers.Layer): ...@@ -439,11 +438,11 @@ class TFBertAttention(tf.keras.layers.Layer):
self.dense_output.build(None) self.dense_output.build(None)
class TFBertIntermediate(tf.keras.layers.Layer): class TFBertIntermediate(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -468,15 +467,15 @@ class TFBertIntermediate(tf.keras.layers.Layer): ...@@ -468,15 +467,15 @@ class TFBertIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFBertOutput(tf.keras.layers.Layer): class TFBertOutput(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -498,7 +497,7 @@ class TFBertOutput(tf.keras.layers.Layer): ...@@ -498,7 +497,7 @@ class TFBertOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLayer(tf.keras.layers.Layer): class TFBertLayer(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -601,7 +600,7 @@ class TFBertLayer(tf.keras.layers.Layer): ...@@ -601,7 +600,7 @@ class TFBertLayer(tf.keras.layers.Layer):
self.crossattention.build(None) self.crossattention.build(None)
class TFBertEncoder(tf.keras.layers.Layer): class TFBertEncoder(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -679,11 +678,11 @@ class TFBertEncoder(tf.keras.layers.Layer): ...@@ -679,11 +678,11 @@ class TFBertEncoder(tf.keras.layers.Layer):
layer.build(None) layer.build(None)
class TFBertPooler(tf.keras.layers.Layer): class TFBertPooler(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
...@@ -708,11 +707,11 @@ class TFBertPooler(tf.keras.layers.Layer): ...@@ -708,11 +707,11 @@ class TFBertPooler(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFBertPredictionHeadTransform(tf.keras.layers.Layer): class TFBertPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
...@@ -723,7 +722,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -723,7 +722,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
...@@ -745,8 +744,8 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -745,8 +744,8 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBertLMPredictionHead(tf.keras.layers.Layer): class TFBertLMPredictionHead(keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -768,7 +767,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -768,7 +767,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
with tf.name_scope(self.transform.name): with tf.name_scope(self.transform.name):
self.transform.build(None) self.transform.build(None)
def get_output_embeddings(self) -> tf.keras.layers.Layer: def get_output_embeddings(self) -> keras.layers.Layer:
return self.input_embeddings return self.input_embeddings
def set_output_embeddings(self, value: tf.Variable): def set_output_embeddings(self, value: tf.Variable):
...@@ -793,8 +792,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): ...@@ -793,8 +792,8 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFBertMLMHead(tf.keras.layers.Layer): class TFBertMLMHead(keras.layers.Layer):
def __init__(self, config: BertConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")
...@@ -813,11 +812,11 @@ class TFBertMLMHead(tf.keras.layers.Layer): ...@@ -813,11 +812,11 @@ class TFBertMLMHead(tf.keras.layers.Layer):
self.predictions.build(None) self.predictions.build(None)
class TFBertNSPHead(tf.keras.layers.Layer): class TFBertNSPHead(keras.layers.Layer):
def __init__(self, config: BertConfig, **kwargs): def __init__(self, config: BertConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.seq_relationship = tf.keras.layers.Dense( self.seq_relationship = keras.layers.Dense(
units=2, units=2,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="seq_relationship", name="seq_relationship",
...@@ -839,7 +838,7 @@ class TFBertNSPHead(tf.keras.layers.Layer): ...@@ -839,7 +838,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBertMainLayer(tf.keras.layers.Layer): class TFBertMainLayer(keras.layers.Layer):
config_class = BertConfig config_class = BertConfig
def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs): def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs):
...@@ -852,7 +851,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -852,7 +851,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
self.encoder = TFBertEncoder(config, name="encoder") self.encoder = TFBertEncoder(config, name="encoder")
self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.embeddings return self.embeddings
def set_input_embeddings(self, value: tf.Variable): def set_input_embeddings(self, value: tf.Variable):
...@@ -1086,7 +1085,7 @@ BERT_START_DOCSTRING = r""" ...@@ -1086,7 +1085,7 @@ BERT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -1281,7 +1280,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): ...@@ -1281,7 +1280,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
self.nsp = TFBertNSPHead(config, name="nsp___cls") self.nsp = TFBertNSPHead(config, name="nsp___cls")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
def get_prefix_bias_name(self) -> str: def get_prefix_bias_name(self) -> str:
...@@ -1407,7 +1406,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ...@@ -1407,7 +1406,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
def get_prefix_bias_name(self) -> str: def get_prefix_bias_name(self) -> str:
...@@ -1500,7 +1499,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): ...@@ -1500,7 +1499,7 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls")
def get_lm_head(self) -> tf.keras.layers.Layer: def get_lm_head(self) -> keras.layers.Layer:
return self.mlm.predictions return self.mlm.predictions
def get_prefix_bias_name(self) -> str: def get_prefix_bias_name(self) -> str:
...@@ -1732,8 +1731,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -1732,8 +1731,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
classifier_dropout = ( classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
) )
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout) self.dropout = keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -1825,8 +1824,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1825,8 +1824,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.bert = TFBertMainLayer(config, name="bert") self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
self.config = config self.config = config
...@@ -1947,8 +1946,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1947,8 +1946,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
classifier_dropout = ( classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
) )
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout) self.dropout = keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense( self.classifier = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="classifier", name="classifier",
...@@ -2045,7 +2044,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ...@@ -2045,7 +2044,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.qa_outputs = tf.keras.layers.Dense( self.qa_outputs = keras.layers.Dense(
units=config.num_labels, units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="qa_outputs", name="qa_outputs",
......
...@@ -5,10 +5,11 @@ import tensorflow as tf ...@@ -5,10 +5,11 @@ import tensorflow as tf
from tensorflow_text import BertTokenizer as BertTokenizerLayer from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs
from ...modeling_tf_utils import keras
from .tokenization_bert import BertTokenizer from .tokenization_bert import BertTokenizer
class TFBertTokenizer(tf.keras.layers.Layer): class TFBertTokenizer(keras.layers.Layer):
""" """
This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
`from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
......
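Assuming `tensorflow_text` is installed, a rough usage sketch of the in-graph tokenizer described above (the checkpoint name and inputs are illustrative):

```py
import tensorflow as tf
from transformers import TFBertTokenizer

tokenizer = TFBertTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
encoded = tokenizer(tf.constant(["hello world", "an in-graph tokenizer"]))
print(encoded["input_ids"])  # tensors usable directly as model inputs inside a tf.function
```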
...@@ -36,6 +36,7 @@ from ...modeling_tf_outputs import ( ...@@ -36,6 +36,7 @@ from ...modeling_tf_outputs import (
from ...modeling_tf_utils import ( from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFPreTrainedModel, TFPreTrainedModel,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): ...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
return (one_cst - expanded_mask) * LARGE_NEGATIVE return (one_cst - expanded_mask) * LARGE_NEGATIVE
class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding): class TFBlenderbotLearnedPositionalEmbedding(keras.layers.Embedding):
""" """
This module learns positional embeddings up to a fixed maximum size. This module learns positional embeddings up to a fixed maximum size.
""" """
...@@ -138,7 +139,7 @@ class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding): ...@@ -138,7 +139,7 @@ class TFBlenderbotLearnedPositionalEmbedding(tf.keras.layers.Embedding):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot
class TFBlenderbotAttention(tf.keras.layers.Layer): class TFBlenderbotAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need""" """Multi-headed attention from "Attention Is All You Need"""
def __init__( def __init__(
...@@ -154,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): ...@@ -154,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim: if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError( raise ValueError(
...@@ -164,10 +165,10 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): ...@@ -164,10 +165,10 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
...@@ -309,20 +310,20 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): ...@@ -309,20 +310,20 @@ class TFBlenderbotAttention(tf.keras.layers.Layer):
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot
class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): class TFBlenderbotEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotConfig, **kwargs): def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
self.self_attn = TFBlenderbotAttention( self.self_attn = TFBlenderbotAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
) )
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -387,7 +388,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer): ...@@ -387,7 +388,7 @@ class TFBlenderbotEncoderLayer(tf.keras.layers.Layer):
# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot
class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): class TFBlenderbotDecoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotConfig, **kwargs): def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
...@@ -398,11 +399,11 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): ...@@ -398,11 +399,11 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
name="self_attn", name="self_attn",
is_decoder=True, is_decoder=True,
) )
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBlenderbotAttention( self.encoder_attn = TFBlenderbotAttention(
self.embed_dim, self.embed_dim,
config.decoder_attention_heads, config.decoder_attention_heads,
...@@ -410,10 +411,10 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer): ...@@ -410,10 +411,10 @@ class TFBlenderbotDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn", name="encoder_attn",
is_decoder=True, is_decoder=True,
) )
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -533,7 +534,7 @@ BLENDERBOT_START_DOCSTRING = r""" ...@@ -533,7 +534,7 @@ BLENDERBOT_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
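As a quick orientation (not part of the diff), any of these TF classes can be driven like an ordinary Keras model once loaded. A minimal sketch, assuming the public `facebook/blenderbot-400M-distill` checkpoint as an illustrative example:

```py
# Minimal usage sketch; the checkpoint name is an illustrative assumption.
import tensorflow as tf

from transformers import AutoTokenizer, TFBlenderbotModel

tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
model = TFBlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")

enc = tokenizer("Hello, how are you?", return_tensors="tf")
# Seq2seq models need decoder inputs; reusing the encoder ids keeps the sketch simple.
outputs = model(input_ids=enc.input_ids, decoder_input_ids=enc.input_ids)
print(outputs.last_hidden_state.shape)  # (batch, seq_len, d_model)
```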
...@@ -677,7 +678,7 @@ BLENDERBOT_INPUTS_DOCSTRING = r""" ...@@ -677,7 +678,7 @@ BLENDERBOT_INPUTS_DOCSTRING = r"""
@keras_serializable @keras_serializable
class TFBlenderbotEncoder(tf.keras.layers.Layer): class TFBlenderbotEncoder(keras.layers.Layer):
config_class = BlenderbotConfig config_class = BlenderbotConfig
""" """
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
...@@ -687,10 +688,10 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): ...@@ -687,10 +688,10 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
config: BlenderbotConfig config: BlenderbotConfig
""" """
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings self.max_source_positions = config.max_position_embeddings
...@@ -703,7 +704,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): ...@@ -703,7 +704,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
name="embed_positions", name="embed_positions",
) )
self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -849,7 +850,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer): ...@@ -849,7 +850,7 @@ class TFBlenderbotEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotDecoder(tf.keras.layers.Layer): class TFBlenderbotDecoder(keras.layers.Layer):
config_class = BlenderbotConfig config_class = BlenderbotConfig
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`] Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`]
...@@ -859,7 +860,7 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): ...@@ -859,7 +860,7 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding embed_tokens: output embedding
""" """
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs): def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
...@@ -872,9 +873,9 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): ...@@ -872,9 +873,9 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
) )
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -1090,17 +1091,17 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer): ...@@ -1090,17 +1091,17 @@ class TFBlenderbotDecoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotMainLayer(tf.keras.layers.Layer): class TFBlenderbotMainLayer(keras.layers.Layer):
config_class = BlenderbotConfig config_class = BlenderbotConfig
def __init__(self, config: BlenderbotConfig, **kwargs): def __init__(self, config: BlenderbotConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.shared = tf.keras.layers.Embedding( self.shared = keras.layers.Embedding(
input_dim=config.vocab_size, input_dim=config.vocab_size,
output_dim=config.d_model, output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared", name="model.shared",
) )
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
...@@ -1325,9 +1326,9 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel): ...@@ -1325,9 +1326,9 @@ class TFBlenderbotModel(TFBlenderbotPreTrainedModel):
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer): class BiasLayer(keras.layers.Layer):
""" """
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer. so all weights have to be registered in a layer.
""" """
......
...@@ -35,6 +35,7 @@ from ...modeling_tf_outputs import ( ...@@ -35,6 +35,7 @@ from ...modeling_tf_outputs import (
from ...modeling_tf_utils import ( from ...modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFPreTrainedModel, TFPreTrainedModel,
keras,
keras_serializable, keras_serializable,
unpack_inputs, unpack_inputs,
) )
...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): ...@@ -117,7 +118,7 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
# Copied from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall # Copied from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall
class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding): class TFBlenderbotSmallLearnedPositionalEmbedding(keras.layers.Embedding):
""" """
This module learns positional embeddings up to a fixed maximum size. This module learns positional embeddings up to a fixed maximum size.
""" """
...@@ -138,7 +139,7 @@ class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding): ...@@ -138,7 +139,7 @@ class TFBlenderbotSmallLearnedPositionalEmbedding(tf.keras.layers.Embedding):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall # Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall
class TFBlenderbotSmallAttention(tf.keras.layers.Layer): class TFBlenderbotSmallAttention(keras.layers.Layer):
"""Multi-headed attention from "Attention Is All You Need""" """Multi-headed attention from "Attention Is All You Need"""
def __init__( def __init__(
...@@ -154,7 +155,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): ...@@ -154,7 +155,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.dropout = tf.keras.layers.Dropout(dropout) self.dropout = keras.layers.Dropout(dropout)
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
if (self.head_dim * num_heads) != self.embed_dim: if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError( raise ValueError(
...@@ -164,10 +165,10 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): ...@@ -164,10 +165,10 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.is_decoder = is_decoder self.is_decoder = is_decoder
self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
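For context only: the Dense projections and `_shape` above feed standard scaled dot-product attention. A small sketch of that computation, with illustrative shapes that are not taken from the diff (`self.scaling == head_dim ** -0.5` plays the role of the rescale on `q`):

```py
import tensorflow as tf

bsz, num_heads, seq_len, head_dim = 2, 4, 8, 16
q = tf.random.normal((bsz, num_heads, seq_len, head_dim)) * head_dim**-0.5
k = tf.random.normal((bsz, num_heads, seq_len, head_dim))
v = tf.random.normal((bsz, num_heads, seq_len, head_dim))

scores = tf.matmul(q, k, transpose_b=True)   # (bsz, heads, seq, seq)
probs = tf.nn.softmax(scores, axis=-1)
context = tf.matmul(probs, v)                # (bsz, heads, seq, head_dim)
```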
...@@ -309,20 +310,20 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): ...@@ -309,20 +310,20 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall # Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): class TFBlenderbotSmallEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotSmallConfig, **kwargs): def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
self.self_attn = TFBlenderbotSmallAttention( self.self_attn = TFBlenderbotSmallAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
) )
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -387,7 +388,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer): ...@@ -387,7 +388,7 @@ class TFBlenderbotSmallEncoderLayer(tf.keras.layers.Layer):
# Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall # Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall
class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): class TFBlenderbotSmallDecoderLayer(keras.layers.Layer):
def __init__(self, config: BlenderbotSmallConfig, **kwargs): def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.d_model self.embed_dim = config.d_model
...@@ -398,11 +399,11 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): ...@@ -398,11 +399,11 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
name="self_attn", name="self_attn",
is_decoder=True, is_decoder=True,
) )
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.activation_fn = get_tf_activation(config.activation_function) self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBlenderbotSmallAttention( self.encoder_attn = TFBlenderbotSmallAttention(
self.embed_dim, self.embed_dim,
config.decoder_attention_heads, config.decoder_attention_heads,
...@@ -410,10 +411,10 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer): ...@@ -410,10 +411,10 @@ class TFBlenderbotSmallDecoderLayer(tf.keras.layers.Layer):
name="encoder_attn", name="encoder_attn",
is_decoder=True, is_decoder=True,
) )
self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
self.config = config self.config = config
def call( def call(
...@@ -533,7 +534,7 @@ BLENDERBOT_SMALL_START_DOCSTRING = r""" ...@@ -533,7 +534,7 @@ BLENDERBOT_SMALL_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -681,7 +682,7 @@ BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" ...@@ -681,7 +682,7 @@ BLENDERBOT_SMALL_INPUTS_DOCSTRING = r"""
@keras_serializable @keras_serializable
class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): class TFBlenderbotSmallEncoder(keras.layers.Layer):
config_class = BlenderbotSmallConfig config_class = BlenderbotSmallConfig
""" """
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
...@@ -691,12 +692,10 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): ...@@ -691,12 +692,10 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
config: BlenderbotSmallConfig config: BlenderbotSmallConfig
""" """
def __init__( def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs
):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop self.layerdrop = config.encoder_layerdrop
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings self.max_source_positions = config.max_position_embeddings
...@@ -709,7 +708,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): ...@@ -709,7 +708,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
name="embed_positions", name="embed_positions",
) )
self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.embed_dim = config.d_model self.embed_dim = config.d_model
def get_embed_tokens(self): def get_embed_tokens(self):
...@@ -855,7 +854,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer): ...@@ -855,7 +854,7 @@ class TFBlenderbotSmallEncoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): class TFBlenderbotSmallDecoder(keras.layers.Layer):
config_class = BlenderbotSmallConfig config_class = BlenderbotSmallConfig
""" """
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`] Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`]
...@@ -865,9 +864,7 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): ...@@ -865,9 +864,7 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
embed_tokens: output embedding embed_tokens: output embedding
""" """
def __init__( def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs):
self, config: BlenderbotSmallConfig, embed_tokens: Optional[tf.keras.layers.Embedding] = None, **kwargs
):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.padding_idx = config.pad_token_id self.padding_idx = config.pad_token_id
...@@ -880,9 +877,9 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): ...@@ -880,9 +877,9 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
) )
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = keras.layers.Dropout(config.dropout)
def get_embed_tokens(self): def get_embed_tokens(self):
return self.embed_tokens return self.embed_tokens
...@@ -1095,17 +1092,17 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer): ...@@ -1095,17 +1092,17 @@ class TFBlenderbotSmallDecoder(tf.keras.layers.Layer):
@keras_serializable @keras_serializable
class TFBlenderbotSmallMainLayer(tf.keras.layers.Layer): class TFBlenderbotSmallMainLayer(keras.layers.Layer):
config_class = BlenderbotSmallConfig config_class = BlenderbotSmallConfig
def __init__(self, config: BlenderbotSmallConfig, **kwargs): def __init__(self, config: BlenderbotSmallConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.shared = tf.keras.layers.Embedding( self.shared = keras.layers.Embedding(
input_dim=config.vocab_size, input_dim=config.vocab_size,
output_dim=config.d_model, output_dim=config.d_model,
embeddings_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.config.init_std), embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
name="model.shared", name="model.shared",
) )
# Additional attribute to specify the expected name scope of the layer (for loading/storing weights) # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
...@@ -1314,9 +1311,9 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel): ...@@ -1314,9 +1311,9 @@ class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel):
# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer # Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
class BiasLayer(tf.keras.layers.Layer): class BiasLayer(keras.layers.Layer):
""" """
Bias as a layer. It is used for serialization purposes: `tf.keras.Model.save_weights` stores on a per-layer basis, Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
so all weights have to be registered in a layer. so all weights have to be registered in a layer.
""" """
......
...@@ -27,6 +27,7 @@ from ...modeling_tf_utils import ( ...@@ -27,6 +27,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
get_initializer, get_initializer,
get_tf_activation, get_tf_activation,
keras,
keras_serializable, keras_serializable,
shape_list, shape_list,
unpack_inputs, unpack_inputs,
...@@ -63,7 +64,7 @@ TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -63,7 +64,7 @@ TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss # Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
return tf.math.reduce_mean( return tf.math.reduce_mean(
tf.keras.metrics.sparse_categorical_crossentropy( keras.metrics.sparse_categorical_crossentropy(
y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
) )
) )
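To see why `y_true=tf.range(...)` is correct here: row *i* of `logits` holds the similarities of sample *i* against every candidate, so the matching class index for row *i* is simply *i* (the diagonal). A toy illustration with made-up values:

```py
import tensorflow as tf

from transformers.modeling_tf_utils import keras

similarity = tf.constant(
    [[4.0, 0.1, 0.2],
     [0.3, 5.0, 0.1],
     [0.2, 0.4, 6.0]]
)  # three matched pairs; the diagonal entries are the positive matches
loss = tf.math.reduce_mean(
    keras.metrics.sparse_categorical_crossentropy(
        y_true=tf.range(3), y_pred=similarity, from_logits=True
    )
)
```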
...@@ -234,7 +235,7 @@ class TFBlipOutput(ModelOutput): ...@@ -234,7 +235,7 @@ class TFBlipOutput(ModelOutput):
) )
class TFBlipVisionEmbeddings(tf.keras.layers.Layer): class TFBlipVisionEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipVisionConfig, **kwargs): def __init__(self, config: BlipVisionConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -242,7 +243,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer): ...@@ -242,7 +243,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
self.image_size = config.image_size self.image_size = config.image_size
self.patch_size = config.patch_size self.patch_size = config.patch_size
self.patch_embedding = tf.keras.layers.Conv2D( self.patch_embedding = keras.layers.Conv2D(
filters=self.embed_dim, filters=self.embed_dim,
kernel_size=self.patch_size, kernel_size=self.patch_size,
strides=self.patch_size, strides=self.patch_size,
...@@ -291,7 +292,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer): ...@@ -291,7 +292,7 @@ class TFBlipVisionEmbeddings(tf.keras.layers.Layer):
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip # Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip
class TFBlipTextEmbeddings(tf.keras.layers.Layer): class TFBlipTextEmbeddings(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -349,7 +350,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): ...@@ -349,7 +350,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
return final_embeddings return final_embeddings
class TFBlipAttention(tf.keras.layers.Layer): class TFBlipAttention(keras.layers.Layer):
"""Multi-headed attention from 'Attention Is All You Need' paper""" """Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
...@@ -364,13 +365,13 @@ class TFBlipAttention(tf.keras.layers.Layer): ...@@ -364,13 +365,13 @@ class TFBlipAttention(tf.keras.layers.Layer):
f" {self.num_heads})." f" {self.num_heads})."
) )
self.scale = self.head_dim**-0.5 self.scale = self.head_dim**-0.5
self.dropout = tf.keras.layers.Dropout(config.attention_dropout, name="dropout") self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout")
self.qkv = tf.keras.layers.Dense( self.qkv = keras.layers.Dense(
3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv" 3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
) )
self.projection = tf.keras.layers.Dense( self.projection = keras.layers.Dense(
self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
) )
...@@ -433,7 +434,7 @@ class TFBlipAttention(tf.keras.layers.Layer): ...@@ -433,7 +434,7 @@ class TFBlipAttention(tf.keras.layers.Layer):
self.projection.build([None, None, self.embed_dim]) self.projection.build([None, None, self.embed_dim])
class TFBlipMLP(tf.keras.layers.Layer): class TFBlipMLP(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs): def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -442,10 +443,10 @@ class TFBlipMLP(tf.keras.layers.Layer): ...@@ -442,10 +443,10 @@ class TFBlipMLP(tf.keras.layers.Layer):
in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5)
fc_std = (2 * config.hidden_size) ** -0.5 fc_std = (2 * config.hidden_size) ** -0.5
self.fc1 = tf.keras.layers.Dense( self.fc1 = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
) )
self.fc2 = tf.keras.layers.Dense( self.fc2 = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
) )
self.config = config self.config = config
...@@ -468,14 +469,14 @@ class TFBlipMLP(tf.keras.layers.Layer): ...@@ -468,14 +469,14 @@ class TFBlipMLP(tf.keras.layers.Layer):
self.fc2.build([None, None, self.config.intermediate_size]) self.fc2.build([None, None, self.config.intermediate_size])
class TFBlipEncoderLayer(tf.keras.layers.Layer): class TFBlipEncoderLayer(keras.layers.Layer):
def __init__(self, config: BlipConfig, **kwargs): def __init__(self, config: BlipConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.embed_dim = config.hidden_size self.embed_dim = config.hidden_size
self.self_attn = TFBlipAttention(config, name="self_attn") self.self_attn = TFBlipAttention(config, name="self_attn")
self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
self.mlp = TFBlipMLP(config, name="mlp") self.mlp = TFBlipMLP(config, name="mlp")
self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
def call( def call(
self, self,
...@@ -551,7 +552,7 @@ BLIP_START_DOCSTRING = r""" ...@@ -551,7 +552,7 @@ BLIP_START_DOCSTRING = r"""
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.) etc.)
This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior. behavior.
...@@ -614,7 +615,7 @@ BLIP_INPUTS_DOCSTRING = r""" ...@@ -614,7 +615,7 @@ BLIP_INPUTS_DOCSTRING = r"""
@keras_serializable @keras_serializable
class TFBlipEncoder(tf.keras.layers.Layer): class TFBlipEncoder(keras.layers.Layer):
config_class = BlipConfig config_class = BlipConfig
""" """
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
...@@ -714,7 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel): ...@@ -714,7 +715,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings") self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
self.encoder = TFBlipEncoder(config, name="encoder") self.encoder = TFBlipEncoder(config, name="encoder")
self.post_layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
self.embed_dim = config.hidden_size self.embed_dim = config.hidden_size
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
...@@ -798,7 +799,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel): ...@@ -798,7 +799,7 @@ class TFBlipVisionModel(TFBlipPreTrainedModel):
self.post_layernorm.build([None, None, self.embed_dim]) self.post_layernorm.build([None, None, self.embed_dim])
class TFBlipMainLayer(tf.keras.layers.Layer): class TFBlipMainLayer(keras.layers.Layer):
config_class = BlipConfig config_class = BlipConfig
def __init__(self, config: BlipConfig, *args, **kwargs): def __init__(self, config: BlipConfig, *args, **kwargs):
...@@ -826,13 +827,13 @@ class TFBlipMainLayer(tf.keras.layers.Layer): ...@@ -826,13 +827,13 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
self.text_model = TFBlipTextModel(text_config, name="text_model") self.text_model = TFBlipTextModel(text_config, name="text_model")
self.vision_model = TFBlipVisionModel(vision_config, name="vision_model") self.vision_model = TFBlipVisionModel(vision_config, name="vision_model")
self.visual_projection = tf.keras.layers.Dense( self.visual_projection = keras.layers.Dense(
self.projection_dim, self.projection_dim,
use_bias=False, use_bias=False,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="visual_projection", name="visual_projection",
) )
self.text_projection = tf.keras.layers.Dense( self.text_projection = keras.layers.Dense(
self.projection_dim, self.projection_dim,
use_bias=False, use_bias=False,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
...@@ -845,7 +846,7 @@ class TFBlipMainLayer(tf.keras.layers.Layer): ...@@ -845,7 +846,7 @@ class TFBlipMainLayer(tf.keras.layers.Layer):
self.logit_scale = self.add_weight( self.logit_scale = self.add_weight(
name="logit_scale", name="logit_scale",
shape=[], shape=[],
initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
trainable=True, trainable=True,
) )
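`logit_scale` acts as a learnable temperature, as in CLIP-style models: similarities are multiplied by its exponential before the contrastive loss. A rough sketch of how such a scale is typically applied (an assumption about usage, not lines from this file):

```py
import tensorflow as tf

logit_scale = tf.Variable(2.6592)  # ~log(1 / 0.07), a common initial value
image_embeds = tf.math.l2_normalize(tf.random.normal((4, 512)), axis=-1)
text_embeds = tf.math.l2_normalize(tf.random.normal((4, 512)), axis=-1)

# Larger exp(logit_scale) sharpens the softmax over candidates.
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * tf.exp(logit_scale)
```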
...@@ -1116,7 +1117,7 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel): ...@@ -1116,7 +1117,7 @@ class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
self.decoder_input_ids = config.text_config.bos_token_id self.decoder_input_ids = config.text_config.bos_token_id
self.decoder_pad_token_id = config.text_config.pad_token_id self.decoder_pad_token_id = config.text_config.pad_token_id
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding return self.vision_model.embeddings.patch_embedding
@unpack_inputs @unpack_inputs
...@@ -1307,7 +1308,7 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel): ...@@ -1307,7 +1308,7 @@ class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
self.decoder_pad_token_id = config.text_config.pad_token_id self.decoder_pad_token_id = config.text_config.pad_token_id
self.decoder_start_token_id = config.text_config.bos_token_id self.decoder_start_token_id = config.text_config.bos_token_id
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding return self.vision_model.embeddings.patch_embedding
# Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right # Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right
...@@ -1557,21 +1558,21 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): ...@@ -1557,21 +1558,21 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
# vision projection layer # vision projection layer
self.vision_proj = tf.keras.layers.Dense( self.vision_proj = keras.layers.Dense(
config.image_text_hidden_size, config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="vision_proj", name="vision_proj",
) )
# text projection layer # text projection layer
self.text_proj = tf.keras.layers.Dense( self.text_proj = keras.layers.Dense(
config.image_text_hidden_size, config.image_text_hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="text_proj", name="text_proj",
) )
# image text matching head # image text matching head
self.itm_head = tf.keras.layers.Dense( self.itm_head = keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head" 2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head"
) )
...@@ -1587,7 +1588,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): ...@@ -1587,7 +1588,7 @@ class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
) )
self.config = config self.config = config
def get_input_embeddings(self) -> tf.keras.layers.Layer: def get_input_embeddings(self) -> keras.layers.Layer:
return self.vision_model.embeddings.patch_embedding return self.vision_model.embeddings.patch_embedding
@unpack_inputs @unpack_inputs
......
...@@ -31,6 +31,7 @@ from ...modeling_tf_utils import ( ...@@ -31,6 +31,7 @@ from ...modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
get_initializer, get_initializer,
get_tf_activation, get_tf_activation,
keras,
keras_serializable, keras_serializable,
shape_list, shape_list,
unpack_inputs, unpack_inputs,
...@@ -75,18 +76,18 @@ BLIP_TEXT_INPUTS_DOCSTRING = r""" ...@@ -75,18 +76,18 @@ BLIP_TEXT_INPUTS_DOCSTRING = r"""
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52
class TFBlipTextEmbeddings(tf.keras.layers.Layer): class TFBlipTextEmbeddings(keras.layers.Layer):
"""Construct the embeddings from word and position embeddings.""" """Construct the embeddings from word and position embeddings."""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.word_embeddings = tf.keras.layers.Embedding( self.word_embeddings = keras.layers.Embedding(
config.vocab_size, config.vocab_size,
config.hidden_size, config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range), embeddings_initializer=get_initializer(config.initializer_range),
name="word_embeddings", name="word_embeddings",
) )
self.position_embeddings = tf.keras.layers.Embedding( self.position_embeddings = keras.layers.Embedding(
config.max_position_embeddings, config.max_position_embeddings,
config.hidden_size, config.hidden_size,
embeddings_initializer=get_initializer(config.initializer_range), embeddings_initializer=get_initializer(config.initializer_range),
...@@ -95,8 +96,8 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): ...@@ -95,8 +96,8 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
# self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load # self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load
# any TensorFlow checkpoint file # any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0) self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
...@@ -146,7 +147,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer): ...@@ -146,7 +147,7 @@ class TFBlipTextEmbeddings(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
class TFBlipTextSelfAttention(tf.keras.layers.Layer): class TFBlipTextSelfAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention, **kwargs): def __init__(self, config, is_cross_attention, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -160,21 +161,21 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer): ...@@ -160,21 +161,21 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense( self.query = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
) )
self.key = tf.keras.layers.Dense( self.key = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
) )
self.value = tf.keras.layers.Dense( self.value = keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
) )
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = tf.keras.layers.Embedding( self.distance_embedding = keras.layers.Embedding(
2 * config.max_position_embeddings - 1, self.attention_head_size 2 * config.max_position_embeddings - 1, self.attention_head_size
) )
self.is_cross_attention = is_cross_attention self.is_cross_attention = is_cross_attention
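Why the `distance_embedding` table has `2 * max_position_embeddings - 1` rows (a sketch of the BERT-style `relative_key` scheme, not code from this file): signed distances between a query position and a key position span `[-(L - 1), L - 1]`, so they are shifted by `L - 1` to become valid embedding indices.

```py
import tensorflow as tf

L = 512                                 # max_position_embeddings (illustrative)
seq_len = 4
query_pos = tf.range(seq_len)[:, None]  # (seq_len, 1)
key_pos = tf.range(seq_len)[None, :]    # (1, seq_len)
distance = query_pos - key_pos          # values in [-(L - 1), L - 1]
indices = distance + L - 1              # values in [0, 2L - 2] -> rows of distance_embedding
```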
...@@ -291,15 +292,15 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer): ...@@ -291,15 +292,15 @@ class TFBlipTextSelfAttention(tf.keras.layers.Layer):
self.value.build([None, None, self.config.hidden_size]) self.value.build([None, None, self.config.hidden_size])
class TFBlipTextSelfOutput(tf.keras.layers.Layer): class TFBlipTextSelfOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
...@@ -322,7 +323,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer): ...@@ -322,7 +323,7 @@ class TFBlipTextSelfOutput(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
class TFBlipTextAttention(tf.keras.layers.Layer): class TFBlipTextAttention(keras.layers.Layer):
def __init__(self, config, is_cross_attention=False, **kwargs): def __init__(self, config, is_cross_attention=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self") self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self")
...@@ -367,11 +368,11 @@ class TFBlipTextAttention(tf.keras.layers.Layer): ...@@ -367,11 +368,11 @@ class TFBlipTextAttention(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText # Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
class TFBlipTextIntermediate(tf.keras.layers.Layer): class TFBlipTextIntermediate(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
...@@ -396,15 +397,15 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer): ...@@ -396,15 +397,15 @@ class TFBlipTextIntermediate(tf.keras.layers.Layer):
self.dense.build([None, None, self.config.hidden_size]) self.dense.build([None, None, self.config.hidden_size])
class TFBlipTextOutput(tf.keras.layers.Layer): class TFBlipTextOutput(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
) )
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
...@@ -426,7 +427,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer): ...@@ -426,7 +427,7 @@ class TFBlipTextOutput(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLayer(tf.keras.layers.Layer): class TFBlipTextLayer(keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
...@@ -504,7 +505,7 @@ class TFBlipTextLayer(tf.keras.layers.Layer): ...@@ -504,7 +505,7 @@ class TFBlipTextLayer(tf.keras.layers.Layer):
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
@keras_serializable @keras_serializable
class TFBlipTextEncoder(tf.keras.layers.Layer): class TFBlipTextEncoder(keras.layers.Layer):
config_class = BlipTextConfig config_class = BlipTextConfig
def __init__(self, config, name=None, **kwargs): def __init__(self, config, name=None, **kwargs):
...@@ -593,11 +594,11 @@ class TFBlipTextEncoder(tf.keras.layers.Layer): ...@@ -593,11 +594,11 @@ class TFBlipTextEncoder(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText # Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
class TFBlipTextPooler(tf.keras.layers.Layer): class TFBlipTextPooler(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
activation="tanh", activation="tanh",
...@@ -623,11 +624,11 @@ class TFBlipTextPooler(tf.keras.layers.Layer): ...@@ -623,11 +624,11 @@ class TFBlipTextPooler(tf.keras.layers.Layer):
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText # Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): class TFBlipTextPredictionHeadTransform(keras.layers.Layer):
def __init__(self, config: BlipTextConfig, **kwargs): def __init__(self, config: BlipTextConfig, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense( self.dense = keras.layers.Dense(
units=config.hidden_size, units=config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="dense", name="dense",
...@@ -638,7 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -638,7 +639,7 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
else: else:
self.transform_act_fn = config.hidden_act self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.config = config self.config = config
def call(self, hidden_states: tf.Tensor) -> tf.Tensor: def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
...@@ -660,14 +661,14 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer): ...@@ -660,14 +661,14 @@ class TFBlipTextPredictionHeadTransform(tf.keras.layers.Layer):
self.LayerNorm.build([None, None, self.config.hidden_size]) self.LayerNorm.build([None, None, self.config.hidden_size])
class TFBlipTextLMPredictionHead(tf.keras.layers.Layer): class TFBlipTextLMPredictionHead(keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.transform = TFBlipTextPredictionHeadTransform(config, name="transform") self.transform = TFBlipTextPredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is # The output weights are the same as the input embeddings, but there is
# an output-only bias for each token. # an output-only bias for each token.
self.decoder = tf.keras.layers.Dense( self.decoder = keras.layers.Dense(
config.vocab_size, config.vocab_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
name="decoder", name="decoder",
...@@ -694,7 +695,7 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer): ...@@ -694,7 +695,7 @@ class TFBlipTextLMPredictionHead(tf.keras.layers.Layer):
return hidden_states return hidden_states
class TFBlipTextOnlyMLMHead(tf.keras.layers.Layer): class TFBlipTextOnlyMLMHead(keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.predictions = TFBlipTextLMPredictionHead(config, name="predictions") self.predictions = TFBlipTextLMPredictionHead(config, name="predictions")
...@@ -1062,7 +1063,7 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel): ...@@ -1062,7 +1063,7 @@ class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
# Keras won't give us label smoothing for sparse CE, so we de-sparsify things here # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
# Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway) # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32) one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
loss_fct = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none") loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32) masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores) lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
lm_loss *= masked_positions lm_loss *= masked_positions
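A standalone illustration of the masked, label-smoothed loss computed above (shapes and values are assumptions): `-100` marks ignored positions, `relu` clamps them to a valid index for `one_hot`, and the mask removes their contribution afterwards.

```py
import tensorflow as tf

from transformers.modeling_tf_utils import keras

vocab_size = 8
labels = tf.constant([[3, 5, -100]])              # (batch, seq); -100 = ignore
logits = tf.random.normal((1, 3, vocab_size))     # shifted prediction scores

one_hot = tf.one_hot(tf.nn.relu(labels), depth=vocab_size, dtype=tf.float32)
loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
mask = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
per_token = loss_fct(one_hot, logits) * mask      # (batch, seq)
loss = tf.reduce_sum(per_token) / tf.reduce_sum(mask)
```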
......